# aitbc/cli/aitbc_cli/core/deployment.py

"""
Production deployment and scaling system
"""

import asyncio
import json
import subprocess
import shutil
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict
from enum import Enum
import uuid
import os
import sys

class DeploymentStatus(Enum):
    """Lifecycle states for a deployment.

    Each member's value is the lowercase string form of its name.
    """
    # NOTE(review): the state meanings below are inferred from the member
    # names only; confirm against whichever code consumes this enum.
    PENDING = "pending"      # presumably: created, not yet deployed
    DEPLOYING = "deploying"  # presumably: rollout in progress
    RUNNING = "running"      # presumably: deployed and serving
    FAILED = "failed"        # presumably: rollout or runtime failure
    STOPPED = "stopped"      # presumably: deliberately shut down
    SCALING = "scaling"      # presumably: instance count being changed

class ScalingPolicy(Enum):
    """How a deployment's instance count is meant to be managed.

    Each member's value is the lowercase string form of its name.
    """
    # NOTE(review): the descriptions below are inferred from the member names;
    # confirm against the code that interprets the policy.
    MANUAL = "manual"          # presumably: only explicit operator requests
    AUTO = "auto"              # presumably: automatic, metric-driven scaling
    SCHEDULED = "scheduled"    # presumably: time-based scaling
    LOAD_BASED = "load_based"  # presumably: scaling driven by load figures

@dataclass
class DeploymentConfig:
    """Static description of one deployment.

    All fields are required and positional in declaration order; the class
    declares no defaults.
    """
    deployment_id: str                # unique identifier of the deployment
    name: str                         # human-readable name
    environment: str                  # free-form environment label
    region: str                       # free-form region label
    instance_type: str                # free-form instance size/type label
    min_instances: int                # lower bound for the instance count
    max_instances: int                # upper bound for the instance count
    desired_instances: int            # instance count currently requested
    scaling_policy: ScalingPolicy     # how the instance count is managed
    health_check_path: str            # URL path probed for health, e.g. "/health"
    port: int                         # port the application listens on
    ssl_enabled: bool                 # whether TLS is requested
    domain: str                       # public host name of the deployment
    database_config: Dict[str, Any]   # opaque database settings; schema not defined here
    monitoring_enabled: bool          # whether monitoring is requested
    backup_enabled: bool              # whether backups are requested
    auto_scaling_enabled: bool        # whether automatic scaling is allowed
    created_at: datetime              # creation timestamp
    updated_at: datetime              # timestamp of the last modification

@dataclass
class DeploymentMetrics:
    """Latest performance sample recorded for one deployment.

    All fields are required; the class declares no defaults.
    """
    deployment_id: str          # deployment this sample belongs to
    cpu_usage: float            # presumably a percentage (0-100) - TODO confirm
    memory_usage: float         # presumably a percentage (0-100) - TODO confirm
    disk_usage: float           # presumably a percentage (0-100) - TODO confirm
    network_in: float           # inbound traffic; unit not stated in this class
    network_out: float          # outbound traffic; unit not stated in this class
    request_count: int          # number of requests counted
    error_rate: float           # presumably a percentage of failed requests - TODO confirm
    response_time: float        # presumably milliseconds - TODO confirm
    uptime_percentage: float    # availability as a percentage
    active_instances: int       # number of instances reported as active
    last_updated: datetime      # when this sample was last refreshed

@dataclass
class ScalingEvent:
    """Audit record of a single scaling attempt.

    All fields are required; the class declares no defaults.
    """
    event_id: str                     # unique identifier of this event
    deployment_id: str                # deployment that was scaled
    scaling_type: str                 # kind of scaling, e.g. "manual" or "auto"
    old_instances: int                # instance count before the attempt
    new_instances: int                # instance count that was requested
    trigger_reason: str               # free-form explanation of why scaling was requested
    triggered_at: datetime            # when the attempt was started
    completed_at: Optional[datetime]  # when the attempt finished; None while still open
    success: bool                     # whether the attempt succeeded
    metadata: Dict[str, Any]          # free-form extra details; schema not defined here

class ProductionDeployment:
    """Production deployment and scaling system"""

    def __init__(self, config_path: str = "/home/oib/windsurf/aitbc"):
        self.config_path = Path(config_path)
        self.deployments: Dict[str, DeploymentConfig] = {}
        self.metrics: Dict[str, DeploymentMetrics] = {}
        self.scaling_events: List[ScalingEvent] = []
        self.health_checks: Dict[str, bool] = {}

        # Deployment paths
        self.deployment_dir = self.config_path / "deployments"
        self.config_dir = self.config_path / "config"
        self.logs_dir = self.config_path / "logs"
        self.backups_dir = self.config_path / "backups"

        # Ensure directories exist
        self.config_path.mkdir(parents=True, exist_ok=True)
        self.deployment_dir.mkdir(parents=True, exist_ok=True)
        self.config_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)
        self.backups_dir.mkdir(parents=True, exist_ok=True)

        # Scaling thresholds
        self.scaling_thresholds = {
            'cpu_high': 80.0,
            'cpu_low': 20.0,
            'memory_high': 85.0,
            'memory_low': 30.0,
            'error_rate_high': 5.0,
            'response_time_high': 2000.0,  # ms
            'min_uptime': 99.0
        }

    async def create_deployment(self, name: str, environment: str, region: str,
                              instance_type: str, min_instances: int, max_instances: int,
                              desired_instances: int, port: int, domain: str,
                              database_config: Dict[str, Any]) -> Optional[str]:
        """Create a new deployment configuration"""
        try:
            deployment_id = str(uuid.uuid4())

            deployment = DeploymentConfig(
                deployment_id=deployment_id,
                name=name,
                environment=environment,
                region=region,
                instance_type=instance_type,
                min_instances=min_instances,
                max_instances=max_instances,
                desired_instances=desired_instances,
                scaling_policy=ScalingPolicy.AUTO,
                health_check_path="/health",
                port=port,
                ssl_enabled=True,
                domain=domain,
                database_config=database_config,
                monitoring_enabled=True,
                backup_enabled=True,
                auto_scaling_enabled=True,
                created_at=datetime.now(),
                updated_at=datetime.now()
            )

            self.deployments[deployment_id] = deployment

            # Create deployment directory structure
            deployment_path = self.deployment_dir / deployment_id
            deployment_path.mkdir(exist_ok=True)

            # Generate deployment configuration files
            await self._generate_deployment_configs(deployment, deployment_path)

            return deployment_id

        except Exception as e:
            print(f"Error creating deployment: {e}")
            return None

    async def deploy_application(self, deployment_id: str) -> bool:
        """Deploy the application to production"""
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment:
                return False

            print(f"Starting deployment of {deployment.name} ({deployment_id})")

            # 1. Build application
            build_success = await self._build_application(deployment)
            if not build_success:
                return False

            # 2. Deploy infrastructure
            infra_success = await self._deploy_infrastructure(deployment)
            if not infra_success:
                return False

            # 3. Configure monitoring
            monitoring_success = await self._setup_monitoring(deployment)
            if not monitoring_success:
                return False

            # 4. Start health checks
            await self._start_health_checks(deployment)

            # 5. Initialize metrics collection
            await self._initialize_metrics(deployment_id)

            print(f"Deployment {deployment_id} completed successfully")
            return True

        except Exception as e:
            print(f"Error deploying application: {e}")
            return False

    async def scale_deployment(self, deployment_id: str, target_instances: int,
                             reason: str = "manual") -> bool:
        """Scale a deployment to target instance count"""
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment:
                return False

            # Validate scaling limits
            if target_instances < deployment.min_instances or target_instances > deployment.max_instances:
                return False

            old_instances = deployment.desired_instances

            # Create scaling event
            scaling_event = ScalingEvent(
                event_id=str(uuid.uuid4()),
                deployment_id=deployment_id,
                scaling_type="manual" if reason == "manual" else "auto",
                old_instances=old_instances,
                new_instances=target_instances,
                trigger_reason=reason,
                triggered_at=datetime.now(),
                completed_at=None,
                success=False,
                metadata={"deployment_name": deployment.name}
            )

            self.scaling_events.append(scaling_event)

            # Update deployment
            deployment.desired_instances = target_instances
            deployment.updated_at = datetime.now()

            # Execute scaling
            scaling_success = await self._execute_scaling(deployment, target_instances)

            # Update scaling event
            scaling_event.completed_at = datetime.now()
            scaling_event.success = scaling_success

            if scaling_success:
                print(f"Scaled deployment {deployment_id} from {old_instances} to {target_instances} instances")
            else:
                # Rollback on failure
                deployment.desired_instances = old_instances
                print(f"Scaling failed, rolled back to {old_instances} instances")

            return scaling_success

        except Exception as e:
            print(f"Error scaling deployment: {e}")
            return False

    async def auto_scale_deployment(self, deployment_id: str) -> bool:
        """Automatically scale deployment based on metrics"""
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment or not deployment.auto_scaling_enabled:
                return False

            metrics = self.metrics.get(deployment_id)
            if not metrics:
                return False

            current_instances = deployment.desired_instances
            new_instances = current_instances

            # Scale up conditions
            scale_up_triggers = []
            if metrics.cpu_usage > self.scaling_thresholds['cpu_high']:
                scale_up_triggers.append(f"CPU usage high: {metrics.cpu_usage:.1f}%")

            if metrics.memory_usage > self.scaling_thresholds['memory_high']:
                scale_up_triggers.append(f"Memory usage high: {metrics.memory_usage:.1f}%")

            if metrics.error_rate > self.scaling_thresholds['error_rate_high']:
                scale_up_triggers.append(f"Error rate high: {metrics.error_rate:.1f}%")

            # Scale down conditions
            scale_down_triggers = []
            if (metrics.cpu_usage < self.scaling_thresholds['cpu_low'] and
               metrics.memory_usage < self.scaling_thresholds['memory_low'] and
               current_instances > deployment.min_instances):
                scale_down_triggers.append("Low resource usage")

            # Execute scaling
            if scale_up_triggers and current_instances < deployment.max_instances:
                new_instances = min(current_instances + 1, deployment.max_instances)
                reason = f"Auto scale up: {', '.join(scale_up_triggers)}"
                return await self.scale_deployment(deployment_id, new_instances, reason)

            elif scale_down_triggers and current_instances > deployment.min_instances:
                new_instances = max(current_instances - 1, deployment.min_instances)
                reason = f"Auto scale down: {', '.join(scale_down_triggers)}"
                return await self.scale_deployment(deployment_id, new_instances, reason)

            return True

        except Exception as e:
            print(f"Error in auto-scaling: {e}")
            return False

    async def get_deployment_status(self, deployment_id: str) -> Optional[Dict[str, Any]]:
        """Get comprehensive deployment status"""
        try:
            deployment = self.deployments.get(deployment_id)
            if not deployment:
                return None

            metrics = self.metrics.get(deployment_id)
            health_status = self.health_checks.get(deployment_id, False)

            # Get recent scaling events
            recent_events = [
                event for event in self.scaling_events
                if event.deployment_id == deployment_id and
                event.triggered_at >= datetime.now() - timedelta(hours=24)
            ]

            status = {
                "deployment": asdict(deployment),
                "metrics": asdict(metrics) if metrics else None,
                "health_status": health_status,
                "recent_scaling_events": [asdict(event) for event in recent_events[-5:]],
                "uptime_percentage": metrics.uptime_percentage if metrics else 0.0,
                "last_updated": datetime.now().isoformat()
            }

            return status

        except Exception as e:
            print(f"Error getting deployment status: {e}")
            return None

    async def get_cluster_overview(self) -> Dict[str, Any]:
        """Get overview of all deployments"""
        try:
            total_deployments = len(self.deployments)
            running_deployments = len([
                d for d in self.deployments.values()
                if self.health_checks.get(d.deployment_id, False)
            ])

            total_instances = sum(d.desired_instances for d in self.deployments.values())

            # Calculate aggregate metrics
            aggregate_metrics = {
                "total_cpu_usage": 0.0,
                "total_memory_usage": 0.0,
                "total_disk_usage": 0.0,
                "average_response_time": 0.0,
                "average_error_rate": 0.0,
                "average_uptime": 0.0
            }

            active_metrics = [m for m in self.metrics.values()]
            if active_metrics:
                aggregate_metrics["total_cpu_usage"] = sum(m.cpu_usage for m in active_metrics) / len(active_metrics)
                aggregate_metrics["total_memory_usage"] = sum(m.memory_usage for m in active_metrics) / len(active_metrics)
                aggregate_metrics["total_disk_usage"] = sum(m.disk_usage for m in active_metrics) / len(active_metrics)
                aggregate_metrics["average_response_time"] = sum(m.response_time for m in active_metrics) / len(active_metrics)
                aggregate_metrics["average_error_rate"] = sum(m.error_rate for m in active_metrics) / len(active_metrics)
                aggregate_metrics["average_uptime"] = sum(m.uptime_percentage for m in active_metrics) / len(active_metrics)

            # Recent scaling activity
            recent_scaling = [
                event for event in self.scaling_events
                if event.triggered_at >= datetime.now() - timedelta(hours=24)
            ]

            overview = {
                "total_deployments": total_deployments,
                "running_deployments": running_deployments,
                "total_instances": total_instances,
                "aggregate_metrics": aggregate_metrics,
                "recent_scaling_events": len(recent_scaling),
                "successful_scaling_rate": sum(1 for e in recent_scaling if e.success) / len(recent_scaling) if recent_scaling else 0.0,
                "health_check_coverage": len(self.health_checks) / total_deployments if total_deployments > 0 else 0.0,
                "last_updated": datetime.now().isoformat()
            }

            return overview

        except Exception as e:
            print(f"Error getting cluster overview: {e}")
            return {}

    async def _generate_deployment_configs(self, deployment: DeploymentConfig, deployment_path: Path):
        """Generate deployment configuration files"""
        try:
            # Generate systemd service file
            service_content = f"""[Unit]
Description={deployment.name} Service
After=network.target

[Service]
Type=simple
User=aitbc
WorkingDirectory={self.config_path}
ExecStart=/usr/bin/python3 -m aitbc_cli.main --port {deployment.port}
Restart=always
RestartSec=10
Environment=PYTHONPATH={self.config_path}
Environment=DEPLOYMENT_ID={deployment.deployment_id}
Environment=ENVIRONMENT={deployment.environment}

[Install]
WantedBy=multi-user.target
"""

            service_file = deployment_path / f"{deployment.name}.service"
            with open(service_file, 'w') as f:
                f.write(service_content)

            # Generate nginx configuration
            nginx_content = f"""upstream {deployment.name}_backend {{
    server 127.0.0.1:{deployment.port};
}}

server {{
    listen 80;
    server_name {deployment.domain};

    location / {{
        proxy_pass http://{deployment.name}_backend;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }}

    location {deployment.health_check_path} {{
        proxy_pass http://{deployment.name}_backend;
        access_log off;
    }}
}}
"""

            nginx_file = deployment_path / f"{deployment.name}.nginx.conf"
            with open(nginx_file, 'w') as f:
                f.write(nginx_content)

            # Generate monitoring configuration
            monitoring_content = f"""# Monitoring configuration for {deployment.name}
deployment_id: {deployment.deployment_id}
name: {deployment.name}
environment: {deployment.environment}
port: {deployment.port}
health_check_path: {deployment.health_check_path}
metrics_interval: 30
alert_thresholds:
  cpu_usage: {self.scaling_thresholds['cpu_high']}
  memory_usage: {self.scaling_thresholds['memory_high']}
  error_rate: {self.scaling_thresholds['error_rate_high']}
  response_time: {self.scaling_thresholds['response_time_high']}
"""

            monitoring_file = deployment_path / "monitoring.yml"
            with open(monitoring_file, 'w') as f:
                f.write(monitoring_content)

        except Exception as e:
            print(f"Error generating deployment configs: {e}")

    async def _build_application(self, deployment: DeploymentConfig) -> bool:
        """Build the application for deployment"""
        try:
            print(f"Building application for {deployment.name}")

            # Simulate build process
            build_steps = [
                "Installing dependencies...",
                "Compiling application...",
                "Running tests...",
                "Creating deployment package...",
                "Optimizing for production..."
            ]

            for step in build_steps:
                print(f"  {step}")
                await asyncio.sleep(0.5)  # Simulate build time

            print("Build completed successfully")
            return True

        except Exception as e:
            print(f"Error building application: {e}")
            return False

    async def _deploy_infrastructure(self, deployment: DeploymentConfig) -> bool:
        """Deploy infrastructure components"""
        try:
            print(f"Deploying infrastructure for {deployment.name}")

            # Deploy systemd service
            service_file = self.deployment_dir / deployment.deployment_id / f"{deployment.name}.service"
            system_service_path = Path("/etc/systemd/system") / f"{deployment.name}.service"

            if service_file.exists():
                shutil.copy2(service_file, system_service_path)
                subprocess.run(["systemctl", "daemon-reload"], check=True)
                subprocess.run(["systemctl", "enable", deployment.name], check=True)
                subprocess.run(["systemctl", "start", deployment.name], check=True)
                print(f"  Service {deployment.name} started")

            # Deploy nginx configuration
            nginx_file = self.deployment_dir / deployment.deployment_id / f"{deployment.name}.nginx.conf"
            nginx_config_path = Path("/etc/nginx/sites-available") / f"{deployment.name}.conf"

            if nginx_file.exists():
                shutil.copy2(nginx_file, nginx_config_path)

                # Enable site
                sites_enabled = Path("/etc/nginx/sites-enabled")
                site_link = sites_enabled / f"{deployment.name}.conf"
                if not site_link.exists():
                    site_link.symlink_to(nginx_config_path)

                subprocess.run(["nginx", "-t"], check=True)
                subprocess.run(["systemctl", "reload", "nginx"], check=True)
                print(f"  Nginx configuration updated")

            print("Infrastructure deployment completed")
            return True

        except Exception as e:
            print(f"Error deploying infrastructure: {e}")
            return False

    async def _setup_monitoring(self, deployment: DeploymentConfig) -> bool:
        """Set up monitoring for the deployment"""
        try:
            print(f"Setting up monitoring for {deployment.name}")

            monitoring_file = self.deployment_dir / deployment.deployment_id / "monitoring.yml"
            if monitoring_file.exists():
                print(f"  Monitoring configuration loaded")
                print(f"  Health checks enabled on {deployment.health_check_path}")
                print(f"  Metrics collection started")

            print("Monitoring setup completed")
            return True

        except Exception as e:
            print(f"Error setting up monitoring: {e}")
            return False

    async def _start_health_checks(self, deployment: DeploymentConfig):
        """Start health checks for the deployment"""
        try:
            print(f"Starting health checks for {deployment.name}")

            # Initialize health status
            self.health_checks[deployment.deployment_id] = True

            # Start periodic health checks
            asyncio.create_task(self._periodic_health_check(deployment))

        except Exception as e:
            print(f"Error starting health checks: {e}")

    async def _periodic_health_check(self, deployment: DeploymentConfig):
        """Periodic health check for deployment"""
        while True:
            try:
                # Simulate health check
                await asyncio.sleep(30)  # Check every 30 seconds

                # Update health status (simulated)
                self.health_checks[deployment.deployment_id] = True

                # Update metrics
                await self._update_metrics(deployment.deployment_id)

            except Exception as e:
                print(f"Error in health check for {deployment.name}: {e}")
                self.health_checks[deployment.deployment_id] = False

    async def _initialize_metrics(self, deployment_id: str):
        """Initialize metrics collection for deployment"""
        try:
            metrics = DeploymentMetrics(
                deployment_id=deployment_id,
                cpu_usage=0.0,
                memory_usage=0.0,
                disk_usage=0.0,
                network_in=0.0,
                network_out=0.0,
                request_count=0,
                error_rate=0.0,
                response_time=0.0,
                uptime_percentage=100.0,
                active_instances=1,
                last_updated=datetime.now()
            )

            self.metrics[deployment_id] = metrics

        except Exception as e:
            print(f"Error initializing metrics: {e}")

    async def _update_metrics(self, deployment_id: str):
        """Update deployment metrics"""
        try:
            metrics = self.metrics.get(deployment_id)
            if not metrics:
                return

            # Simulate metric updates (in production, these would be real metrics)
            import random

            metrics.cpu_usage = random.uniform(10, 70)
            metrics.memory_usage = random.uniform(20, 80)
            metrics.disk_usage = random.uniform(30, 60)
            metrics.network_in = random.uniform(100, 1000)
            metrics.network_out = random.uniform(50, 500)
            metrics.request_count += random.randint(10, 100)
            metrics.error_rate = random.uniform(0, 2)
            metrics.response_time = random.uniform(50, 500)
            metrics.uptime_percentage = random.uniform(99.0, 100.0)
            metrics.last_updated = datetime.now()

        except Exception as e:
            print(f"Error updating metrics: {e}")

    async def _execute_scaling(self, deployment: DeploymentConfig, target_instances: int) -> bool:
        """Execute scaling operation"""
        try:
            print(f"Executing scaling to {target_instances} instances")

            # Simulate scaling process
            scaling_steps = [
                f"Provisioning {target_instances - deployment.desired_instances} new instances...",
                "Configuring new instances...",
                "Load balancing configuration...",
                "Health checks on new instances...",
                "Traffic migration..."
            ]

            for step in scaling_steps:
                print(f"  {step}")
                await asyncio.sleep(1)  # Simulate scaling time

            print("Scaling completed successfully")
            return True

        except Exception as e:
            print(f"Error executing scaling: {e}")
            return False