Files
aitbc/cli/aitbc_cli/core/deployment.py
oib ccedbace53 chore: remove configuration files and enhance blockchain explorer with advanced search, analytics, and export features
- Delete .aitbc.yaml.example CLI configuration template
- Delete .lycheeignore link checker exclusion rules
- Delete .nvmrc Node.js version specification
- Add advanced search panel with filters for address, amount range, transaction type, time range, and validator
- Add analytics dashboard with transaction volume, active addresses, and block time metrics
- Add Chart.js integration
2026-03-02 15:38:25 +01:00

653 lines
25 KiB
Python

"""
Production deployment and scaling system
"""
import asyncio
import json
import subprocess
import shutil
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict
from enum import Enum
import uuid
import os
import sys
class DeploymentStatus(Enum):
    """Deployment status values.

    Each member's value is its lowercase string form. The enum is declared
    for callers; nothing in the visible part of this module reads it.
    """

    PENDING = "pending"
    DEPLOYING = "deploying"
    RUNNING = "running"
    FAILED = "failed"
    STOPPED = "stopped"
    SCALING = "scaling"
class ScalingPolicy(Enum):
    """Scaling policies a deployment can be configured with.

    Each member's value is its lowercase string form.
    """

    MANUAL = "manual"
    AUTO = "auto"
    SCHEDULED = "scheduled"
    LOAD_BASED = "load_based"
@dataclass
class DeploymentConfig:
    """Deployment configuration.

    All fields are required and positional in the order declared below.
    """

    deployment_id: str  # unique identifier of the deployment
    name: str  # used for generated file names, the systemd unit and nginx upstream
    environment: str
    region: str
    instance_type: str
    min_instances: int  # lower bound enforced when scaling
    max_instances: int  # upper bound enforced when scaling
    desired_instances: int  # current target instance count
    scaling_policy: ScalingPolicy
    health_check_path: str  # URL path proxied by the generated nginx config
    port: int  # local port the service listens on
    ssl_enabled: bool
    domain: str  # nginx server_name
    database_config: Dict[str, Any]  # opaque to this module
    monitoring_enabled: bool
    backup_enabled: bool
    auto_scaling_enabled: bool  # gate for metric-driven scaling
    created_at: datetime
    updated_at: datetime
@dataclass
class DeploymentMetrics:
    """Deployment performance metrics.

    A mutable snapshot for one deployment; all fields are required.
    """

    deployment_id: str
    cpu_usage: float  # percent
    memory_usage: float  # percent
    disk_usage: float  # presumably percent - confirm against the collector
    network_in: float  # unit not established by this module
    network_out: float  # unit not established by this module
    request_count: int  # running total
    error_rate: float  # percent
    response_time: float  # milliseconds
    uptime_percentage: float
    active_instances: int
    last_updated: datetime
@dataclass
class ScalingEvent:
    """Scaling event record.

    Describes one attempt to change a deployment's instance count.
    """

    event_id: str
    deployment_id: str
    scaling_type: str  # "manual" or "auto"
    old_instances: int  # instance count before the attempt
    new_instances: int  # requested instance count
    trigger_reason: str  # free-text explanation of why scaling was requested
    triggered_at: datetime
    completed_at: Optional[datetime]  # None until the attempt has finished
    success: bool
    metadata: Dict[str, Any]
class ProductionDeployment:
    """Production deployment and scaling system.

    Keeps in-memory registries of deployments, metrics, health flags and
    scaling events, and writes generated systemd / nginx / monitoring files
    below ``config_path``. The build, scaling and metric-collection steps
    are simulated (sleeps and random values). ``_deploy_infrastructure`` is
    the only step that touches the host: it copies files into ``/etc`` and
    invokes ``systemctl`` and ``nginx``.

    Errors are reported with ``print`` and signalled through return values
    (``False`` / ``None`` / ``{}``); public methods do not raise.
    """

    # Characters accepted in a deployment name. The name is interpolated
    # into file names under /etc, a systemd unit name and an nginx upstream
    # identifier, so path separators and whitespace must be kept out.
    _NAME_CHARS = frozenset(
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789-_."
    )

    def __init__(self, config_path: str = "/home/oib/windsurf/aitbc"):
        """Create the registries and make sure the working directories exist.

        Args:
            config_path: Root directory for generated files. The default is
                an absolute path inside one user's home directory, so most
                callers will want to pass their own location.
        """
        self.config_path = Path(config_path)
        self.deployments: Dict[str, DeploymentConfig] = {}
        self.metrics: Dict[str, DeploymentMetrics] = {}
        self.scaling_events: List[ScalingEvent] = []
        self.health_checks: Dict[str, bool] = {}
        # Strong references to the background health-check tasks; the event
        # loop itself only keeps weak references to tasks.
        self._health_check_tasks: Dict[str, asyncio.Task] = {}

        # Deployment paths
        self.deployment_dir = self.config_path / "deployments"
        self.config_dir = self.config_path / "config"
        self.logs_dir = self.config_path / "logs"
        self.backups_dir = self.config_path / "backups"

        # Ensure directories exist
        for directory in (
            self.config_path,
            self.deployment_dir,
            self.config_dir,
            self.logs_dir,
            self.backups_dir,
        ):
            directory.mkdir(parents=True, exist_ok=True)

        # Scaling thresholds (percentages unless noted otherwise)
        self.scaling_thresholds = {
            'cpu_high': 80.0,
            'cpu_low': 20.0,
            'memory_high': 85.0,
            'memory_low': 30.0,
            'error_rate_high': 5.0,
            'response_time_high': 2000.0,  # ms
            'min_uptime': 99.0,
        }

    def _validate_request(self, name: str, min_instances: int,
                          max_instances: int, desired_instances: int,
                          port: int, domain: str) -> Optional[str]:
        """Return a description of the first invalid argument, or None."""
        if not name or name[0] in ".-" or not set(name) <= self._NAME_CHARS:
            return f"invalid deployment name {name!r}"
        if not 0 <= min_instances <= desired_instances <= max_instances:
            return (
                "instance counts must satisfy "
                "0 <= min_instances <= desired_instances <= max_instances"
            )
        if not 1 <= port <= 65535:
            return f"invalid port {port!r}"
        # The domain is written verbatim into an nginx ``server_name`` line.
        if not domain or any(ch.isspace() or ch in ";{}" for ch in domain):
            return f"invalid domain {domain!r}"
        return None

    async def create_deployment(self, name: str, environment: str, region: str,
                                instance_type: str, min_instances: int, max_instances: int,
                                desired_instances: int, port: int, domain: str,
                                database_config: Dict[str, Any]) -> Optional[str]:
        """Create a new deployment configuration.

        Validates the request, writes the generated configuration files to
        ``<deployment_dir>/<deployment_id>/`` and only then registers the
        deployment, so a registered deployment always has its files.

        Returns:
            The new deployment id, or None when validation or file
            generation fails (the reason is printed).
        """
        try:
            problem = self._validate_request(
                name, min_instances, max_instances, desired_instances, port, domain
            )
            if problem:
                print(f"Error creating deployment: {problem}")
                return None

            deployment_id = str(uuid.uuid4())
            now = datetime.now()  # one timestamp so created_at == updated_at
            deployment = DeploymentConfig(
                deployment_id=deployment_id,
                name=name,
                environment=environment,
                region=region,
                instance_type=instance_type,
                min_instances=min_instances,
                max_instances=max_instances,
                desired_instances=desired_instances,
                scaling_policy=ScalingPolicy.AUTO,
                health_check_path="/health",
                port=port,
                ssl_enabled=True,
                domain=domain,
                database_config=database_config,
                monitoring_enabled=True,
                backup_enabled=True,
                auto_scaling_enabled=True,
                created_at=now,
                updated_at=now,
            )

            # Create deployment directory structure
            deployment_path = self.deployment_dir / deployment_id
            deployment_path.mkdir(parents=True, exist_ok=True)

            # Generate deployment configuration files
            if not await self._generate_deployment_configs(deployment, deployment_path):
                shutil.rmtree(deployment_path, ignore_errors=True)
                return None

            self.deployments[deployment_id] = deployment
            return deployment_id
        except Exception as e:
            print(f"Error creating deployment: {e}")
            return None

    async def deploy_application(self, deployment_id: str) -> bool:
        """Deploy the application to production.

        Runs build, infrastructure deployment and monitoring setup in that
        order, stopping at the first step that fails, then starts health
        checks and initializes metrics.

        Returns:
            True when every step succeeded; False for an unknown id or any
            failed step.
        """
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment:
                return False

            print(f"Starting deployment of {deployment.name} ({deployment_id})")

            # 1. Build application
            if not await self._build_application(deployment):
                return False

            # 2. Deploy infrastructure
            if not await self._deploy_infrastructure(deployment):
                return False

            # 3. Configure monitoring
            if not await self._setup_monitoring(deployment):
                return False

            # 4. Start health checks
            await self._start_health_checks(deployment)

            # 5. Initialize metrics collection
            await self._initialize_metrics(deployment_id)

            print(f"Deployment {deployment_id} completed successfully")
            return True
        except Exception as e:
            print(f"Error deploying application: {e}")
            return False

    async def scale_deployment(self, deployment_id: str, target_instances: int,
                               reason: str = "manual") -> bool:
        """Scale a deployment to target instance count.

        A ScalingEvent is recorded for every attempt that passes validation.
        ``desired_instances`` is changed only after the scaling operation
        succeeded, so a failed attempt leaves the deployment untouched.

        Args:
            deployment_id: Id of a registered deployment.
            target_instances: Requested count; must lie within the
                deployment's ``min_instances``..``max_instances``.
            reason: Recorded as the trigger reason. The literal "manual"
                marks the event as manual, anything else as "auto".

        Returns:
            True when scaling succeeded, False otherwise.
        """
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment:
                return False

            # Validate scaling limits
            if not deployment.min_instances <= target_instances <= deployment.max_instances:
                return False

            old_instances = deployment.desired_instances

            # Create scaling event
            scaling_event = ScalingEvent(
                event_id=str(uuid.uuid4()),
                deployment_id=deployment_id,
                scaling_type="manual" if reason == "manual" else "auto",
                old_instances=old_instances,
                new_instances=target_instances,
                trigger_reason=reason,
                triggered_at=datetime.now(),
                completed_at=None,
                success=False,
                metadata={"deployment_name": deployment.name},
            )
            self.scaling_events.append(scaling_event)

            # Execute scaling while the deployment still carries the old
            # count, so the executor can work out the real difference.
            scaling_success = await self._execute_scaling(deployment, target_instances)

            # Update scaling event
            finished_at = datetime.now()
            scaling_event.completed_at = finished_at
            scaling_event.success = scaling_success

            if scaling_success:
                deployment.desired_instances = target_instances
                deployment.updated_at = finished_at
                metrics = self.metrics.get(deployment_id)
                if metrics:
                    metrics.active_instances = target_instances
                print(f"Scaled deployment {deployment_id} from {old_instances} to {target_instances} instances")
            else:
                print(f"Scaling failed, rolled back to {old_instances} instances")
            return scaling_success
        except Exception as e:
            print(f"Error scaling deployment: {e}")
            return False

    async def auto_scale_deployment(self, deployment_id: str) -> bool:
        """Automatically scale deployment based on metrics.

        Adds one instance when CPU, memory or error rate exceeds its high
        threshold, or removes one when both CPU and memory are below their
        low thresholds. An active scale-up trigger always blocks scaling
        down, even when the deployment is already at ``max_instances``.

        Returns:
            False when the deployment is unknown, has auto-scaling disabled
            or has no metrics yet; otherwise the result of the scaling
            call, or True when no change was needed.
        """
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment or not deployment.auto_scaling_enabled:
                return False

            metrics = self.metrics.get(deployment_id)
            if not metrics:
                return False

            current_instances = deployment.desired_instances
            thresholds = self.scaling_thresholds

            # Scale up conditions
            scale_up_triggers = []
            if metrics.cpu_usage > thresholds['cpu_high']:
                scale_up_triggers.append(f"CPU usage high: {metrics.cpu_usage:.1f}%")
            if metrics.memory_usage > thresholds['memory_high']:
                scale_up_triggers.append(f"Memory usage high: {metrics.memory_usage:.1f}%")
            if metrics.error_rate > thresholds['error_rate_high']:
                scale_up_triggers.append(f"Error rate high: {metrics.error_rate:.1f}%")

            if scale_up_triggers:
                if current_instances < deployment.max_instances:
                    reason = f"Auto scale up: {', '.join(scale_up_triggers)}"
                    return await self.scale_deployment(
                        deployment_id, current_instances + 1, reason
                    )
                # Already at the ceiling: stay put rather than shrink a
                # deployment that is under pressure.
                return True

            # Scale down conditions
            low_usage = (
                metrics.cpu_usage < thresholds['cpu_low']
                and metrics.memory_usage < thresholds['memory_low']
            )
            if low_usage and current_instances > deployment.min_instances:
                reason = "Auto scale down: Low resource usage"
                return await self.scale_deployment(
                    deployment_id, current_instances - 1, reason
                )
            return True
        except Exception as e:
            print(f"Error in auto-scaling: {e}")
            return False

    async def get_deployment_status(self, deployment_id: str) -> Optional[Dict[str, Any]]:
        """Get comprehensive deployment status.

        Returns:
            A dict with the deployment and metrics as plain dicts (values
            such as datetimes and enum members are left as objects), the
            health flag, up to five scaling events from the last 24 hours,
            the uptime percentage and a timestamp. None for an unknown id
            or on error.
        """
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment:
                return None

            metrics = self.metrics.get(deployment_id)
            health_status = self.health_checks.get(deployment_id, False)

            # Get recent scaling events
            now = datetime.now()
            cutoff = now - timedelta(hours=24)
            recent_events = [
                event for event in self.scaling_events
                if event.deployment_id == deployment_id and event.triggered_at >= cutoff
            ]

            return {
                "deployment": asdict(deployment),
                "metrics": asdict(metrics) if metrics else None,
                "health_status": health_status,
                "recent_scaling_events": [asdict(event) for event in recent_events[-5:]],
                "uptime_percentage": metrics.uptime_percentage if metrics else 0.0,
                "last_updated": now.isoformat(),
            }
        except Exception as e:
            print(f"Error getting deployment status: {e}")
            return None

    async def get_cluster_overview(self) -> Dict[str, Any]:
        """Get overview of all deployments.

        Returns:
            A dict of counts, aggregate metrics and scaling statistics for
            the last 24 hours, or an empty dict on error. The
            ``total_*_usage`` entries are means across deployments that
            have metrics, despite their names.
        """
        try:
            total_deployments = len(self.deployments)
            running_deployments = sum(
                1 for d in self.deployments.values()
                if self.health_checks.get(d.deployment_id, False)
            )
            total_instances = sum(d.desired_instances for d in self.deployments.values())

            # Calculate aggregate metrics
            aggregate_metrics = {
                "total_cpu_usage": 0.0,
                "total_memory_usage": 0.0,
                "total_disk_usage": 0.0,
                "average_response_time": 0.0,
                "average_error_rate": 0.0,
                "average_uptime": 0.0,
            }
            active_metrics = list(self.metrics.values())
            if active_metrics:
                count = len(active_metrics)
                sources = {
                    "total_cpu_usage": "cpu_usage",
                    "total_memory_usage": "memory_usage",
                    "total_disk_usage": "disk_usage",
                    "average_response_time": "response_time",
                    "average_error_rate": "error_rate",
                    "average_uptime": "uptime_percentage",
                }
                for key, attribute in sources.items():
                    aggregate_metrics[key] = (
                        sum(getattr(m, attribute) for m in active_metrics) / count
                    )

            # Recent scaling activity
            now = datetime.now()
            cutoff = now - timedelta(hours=24)
            recent_scaling = [
                event for event in self.scaling_events if event.triggered_at >= cutoff
            ]
            successful = sum(1 for event in recent_scaling if event.success)

            return {
                "total_deployments": total_deployments,
                "running_deployments": running_deployments,
                "total_instances": total_instances,
                "aggregate_metrics": aggregate_metrics,
                "recent_scaling_events": len(recent_scaling),
                "successful_scaling_rate": successful / len(recent_scaling) if recent_scaling else 0.0,
                "health_check_coverage": len(self.health_checks) / total_deployments if total_deployments > 0 else 0.0,
                "last_updated": now.isoformat(),
            }
        except Exception as e:
            print(f"Error getting cluster overview: {e}")
            return {}

    async def _generate_deployment_configs(self, deployment: "DeploymentConfig",
                                           deployment_path: Path) -> bool:
        """Generate deployment configuration files.

        Writes ``<name>.service``, ``<name>.nginx.conf`` and
        ``monitoring.yml`` into ``deployment_path``. The nginx server block
        listens on port 80 only; ``ssl_enabled`` is not consulted here.

        Returns:
            True when all three files were written, False otherwise.
        """
        try:
            # Generate systemd service file
            service_lines = [
                "[Unit]",
                f"Description={deployment.name} Service",
                "After=network.target",
                "",
                "[Service]",
                "Type=simple",
                "User=aitbc",
                f"WorkingDirectory={self.config_path}",
                f"ExecStart=/usr/bin/python3 -m aitbc_cli.main --port {deployment.port}",
                "Restart=always",
                "RestartSec=10",
                f"Environment=PYTHONPATH={self.config_path}",
                f"Environment=DEPLOYMENT_ID={deployment.deployment_id}",
                f"Environment=ENVIRONMENT={deployment.environment}",
                "",
                "[Install]",
                "WantedBy=multi-user.target",
            ]
            service_file = deployment_path / f"{deployment.name}.service"
            service_file.write_text("\n".join(service_lines) + "\n", encoding="utf-8")

            # Generate nginx configuration
            upstream = f"{deployment.name}_backend"
            nginx_lines = [
                f"upstream {upstream} {{",
                f"    server 127.0.0.1:{deployment.port};",
                "}",
                "",
                "server {",
                "    listen 80;",
                f"    server_name {deployment.domain};",
                "",
                "    location / {",
                f"        proxy_pass http://{upstream};",
                "        proxy_set_header Host $host;",
                "        proxy_set_header X-Real-IP $remote_addr;",
                "        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;",
                "        proxy_set_header X-Forwarded-Proto $scheme;",
                "    }",
                "",
                f"    location {deployment.health_check_path} {{",
                f"        proxy_pass http://{upstream};",
                "        access_log off;",
                "    }",
                "}",
            ]
            nginx_file = deployment_path / f"{deployment.name}.nginx.conf"
            nginx_file.write_text("\n".join(nginx_lines) + "\n", encoding="utf-8")

            # Generate monitoring configuration. The threshold entries must
            # be indented to be children of ``alert_thresholds`` in YAML.
            thresholds = self.scaling_thresholds
            monitoring_lines = [
                f"# Monitoring configuration for {deployment.name}",
                f"deployment_id: {deployment.deployment_id}",
                f"name: {deployment.name}",
                f"environment: {deployment.environment}",
                f"port: {deployment.port}",
                f"health_check_path: {deployment.health_check_path}",
                "metrics_interval: 30",
                "alert_thresholds:",
                f"  cpu_usage: {thresholds['cpu_high']}",
                f"  memory_usage: {thresholds['memory_high']}",
                f"  error_rate: {thresholds['error_rate_high']}",
                f"  response_time: {thresholds['response_time_high']}",
            ]
            monitoring_file = deployment_path / "monitoring.yml"
            monitoring_file.write_text("\n".join(monitoring_lines) + "\n", encoding="utf-8")
            return True
        except Exception as e:
            print(f"Error generating deployment configs: {e}")
            return False

    async def _build_application(self, deployment: "DeploymentConfig") -> bool:
        """Build the application for deployment.

        The build is simulated: each step is printed, followed by a short
        sleep. Returns True unless an exception occurs.
        """
        try:
            print(f"Building application for {deployment.name}")

            # Simulate build process
            build_steps = [
                "Installing dependencies...",
                "Compiling application...",
                "Running tests...",
                "Creating deployment package...",
                "Optimizing for production...",
            ]
            for step in build_steps:
                print(f"  {step}")
                await asyncio.sleep(0.5)  # Simulate build time

            print("Build completed successfully")
            return True
        except Exception as e:
            print(f"Error building application: {e}")
            return False

    async def _run_command(self, *command: str) -> None:
        """Run an external command in a worker thread.

        Keeps the event loop (and the health-check tasks on it) responsive
        while the command runs. Raises ``subprocess.CalledProcessError`` on
        a non-zero exit status.
        """
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(
            None, lambda: subprocess.run(list(command), check=True)
        )

    async def _deploy_infrastructure(self, deployment: "DeploymentConfig") -> bool:
        """Deploy infrastructure components.

        Installs the generated systemd unit and starts the service, then
        installs and enables the generated nginx site and reloads nginx.
        Writing to ``/etc`` and calling ``systemctl`` normally requires
        elevated privileges.

        Returns:
            True on success; False when the generated service file is
            missing or any step raises.
        """
        try:
            print(f"Deploying infrastructure for {deployment.name}")
            generated_dir = self.deployment_dir / deployment.deployment_id

            # Deploy systemd service
            service_file = generated_dir / f"{deployment.name}.service"
            if not service_file.exists():
                # Without the unit file nothing would be started, so do not
                # report a deployment that did nothing as successful.
                print(f"Error deploying infrastructure: missing {service_file}")
                return False
            system_service_path = Path("/etc/systemd/system") / f"{deployment.name}.service"
            shutil.copy2(service_file, system_service_path)
            await self._run_command("systemctl", "daemon-reload")
            await self._run_command("systemctl", "enable", deployment.name)
            await self._run_command("systemctl", "start", deployment.name)
            print(f"  Service {deployment.name} started")

            # Deploy nginx configuration
            nginx_file = generated_dir / f"{deployment.name}.nginx.conf"
            if nginx_file.exists():
                nginx_config_path = Path("/etc/nginx/sites-available") / f"{deployment.name}.conf"
                shutil.copy2(nginx_file, nginx_config_path)

                # Enable site
                site_link = Path("/etc/nginx/sites-enabled") / f"{deployment.name}.conf"
                if site_link.is_symlink() and not site_link.exists():
                    # A dangling link from an earlier run would make
                    # symlink_to() fail with FileExistsError.
                    site_link.unlink()
                if not site_link.exists():
                    site_link.symlink_to(nginx_config_path)

                await self._run_command("nginx", "-t")
                await self._run_command("systemctl", "reload", "nginx")
                print("  Nginx configuration updated")

            print("Infrastructure deployment completed")
            return True
        except Exception as e:
            print(f"Error deploying infrastructure: {e}")
            return False

    async def _setup_monitoring(self, deployment: "DeploymentConfig") -> bool:
        """Set up monitoring for the deployment.

        Only reports progress: when the generated ``monitoring.yml`` exists
        the status lines are printed. No monitoring backend is contacted.
        """
        try:
            print(f"Setting up monitoring for {deployment.name}")
            monitoring_file = self.deployment_dir / deployment.deployment_id / "monitoring.yml"
            if monitoring_file.exists():
                print("  Monitoring configuration loaded")
                print(f"  Health checks enabled on {deployment.health_check_path}")
                print("  Metrics collection started")
            print("Monitoring setup completed")
            return True
        except Exception as e:
            print(f"Error setting up monitoring: {e}")
            return False

    async def _start_health_checks(self, deployment: "DeploymentConfig"):
        """Start health checks for the deployment.

        Marks the deployment healthy and schedules the periodic check as a
        background task. The task is stored on the instance so it cannot be
        garbage-collected mid-run, and a still-running task for the same
        deployment is cancelled first so a redeploy does not stack checks.
        """
        try:
            print(f"Starting health checks for {deployment.name}")

            # Initialize health status
            self.health_checks[deployment.deployment_id] = True

            previous = self._health_check_tasks.get(deployment.deployment_id)
            if previous is not None and not previous.done():
                previous.cancel()

            # Start periodic health checks
            self._health_check_tasks[deployment.deployment_id] = asyncio.create_task(
                self._periodic_health_check(deployment)
            )
        except Exception as e:
            print(f"Error starting health checks: {e}")

    async def _periodic_health_check(self, deployment: "DeploymentConfig"):
        """Periodic health check for deployment.

        Runs until cancelled: every 30 seconds it marks the deployment
        healthy (the check is simulated) and refreshes its metrics. Any
        other error marks the deployment unhealthy and the loop continues.
        """
        while True:
            try:
                # Simulate health check
                await asyncio.sleep(30)  # Check every 30 seconds

                # Update health status (simulated)
                self.health_checks[deployment.deployment_id] = True

                # Update metrics
                await self._update_metrics(deployment.deployment_id)
            except asyncio.CancelledError:
                # Cancellation must end the loop. On Python 3.7 it is an
                # Exception subclass and would be swallowed below.
                raise
            except Exception as e:
                print(f"Error in health check for {deployment.name}: {e}")
                self.health_checks[deployment.deployment_id] = False

    async def _initialize_metrics(self, deployment_id: str):
        """Initialize metrics collection for deployment.

        Stores a zeroed DeploymentMetrics record with 100% uptime. The
        instance count comes from the registered deployment, falling back
        to 1 when the id is unknown.
        """
        try:
            deployment = self.deployments.get(deployment_id)
            self.metrics[deployment_id] = DeploymentMetrics(
                deployment_id=deployment_id,
                cpu_usage=0.0,
                memory_usage=0.0,
                disk_usage=0.0,
                network_in=0.0,
                network_out=0.0,
                request_count=0,
                error_rate=0.0,
                response_time=0.0,
                uptime_percentage=100.0,
                active_instances=deployment.desired_instances if deployment else 1,
                last_updated=datetime.now(),
            )
        except Exception as e:
            print(f"Error initializing metrics: {e}")

    async def _update_metrics(self, deployment_id: str):
        """Update deployment metrics.

        Fills the existing metrics record with random placeholder values
        and syncs ``active_instances`` with the deployment's desired count.
        Does nothing when no record exists for ``deployment_id``.
        """
        try:
            metrics = self.metrics.get(deployment_id)
            if not metrics:
                return

            # Simulate metric updates (in production, these would be real metrics)
            import random
            metrics.cpu_usage = random.uniform(10, 70)
            metrics.memory_usage = random.uniform(20, 80)
            metrics.disk_usage = random.uniform(30, 60)
            metrics.network_in = random.uniform(100, 1000)
            metrics.network_out = random.uniform(50, 500)
            metrics.request_count += random.randint(10, 100)
            metrics.error_rate = random.uniform(0, 2)
            metrics.response_time = random.uniform(50, 500)
            metrics.uptime_percentage = random.uniform(99.0, 100.0)

            deployment = self.deployments.get(deployment_id)
            if deployment is not None:
                metrics.active_instances = deployment.desired_instances
            metrics.last_updated = datetime.now()
        except Exception as e:
            print(f"Error updating metrics: {e}")

    async def _execute_scaling(self, deployment: "DeploymentConfig",
                               target_instances: int) -> bool:
        """Execute scaling operation.

        The operation is simulated: each step is printed, followed by a
        one-second sleep. ``deployment.desired_instances`` must still hold
        the current count when this is called, because the first step
        reports the difference to ``target_instances``.
        """
        try:
            print(f"Executing scaling to {target_instances} instances")

            delta = target_instances - deployment.desired_instances
            if delta > 0:
                first_step = f"Provisioning {delta} new instances..."
            elif delta < 0:
                first_step = f"Draining {-delta} instances..."
            else:
                first_step = "Instance count unchanged..."

            # Simulate scaling process
            scaling_steps = [
                first_step,
                "Configuring new instances...",
                "Load balancing configuration...",
                "Health checks on new instances...",
                "Traffic migration...",
            ]
            for step in scaling_steps:
                print(f"  {step}")
                await asyncio.sleep(1)  # Simulate scaling time

            print("Scaling completed successfully")
            return True
        except Exception as e:
            print(f"Error executing scaling: {e}")
            return False