Remove legacy folder and rewire imports
Some checks failed

- Move marketplace_gpu_optimizer.py from legacy to parallel_processing
- Update coordinator-api imports to use new dev/gpu_acceleration location
- Remove legacy folder as code has been refactored
- Fix marketplace_performance.py imports for gpu_acceleration
This commit is contained in:
@@ -16,16 +16,16 @@ logger = logging.getLogger(__name__)
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../gpu_acceleration"))
|
sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../dev/gpu_acceleration"))
|
||||||
from marketplace_gpu_optimizer import MarketplaceGPUOptimizer
|
from parallel_processing.marketplace_gpu_optimizer import MarketplaceGPUOptimizer
|
||||||
|
|
||||||
from aitbc.gpu_acceleration.parallel_processing.distributed_framework import (
|
from dev.gpu_acceleration.parallel_processing.distributed_framework import (
|
||||||
DistributedProcessingCoordinator,
|
DistributedProcessingCoordinator,
|
||||||
DistributedTask,
|
DistributedTask,
|
||||||
)
|
)
|
||||||
from aitbc.gpu_acceleration.parallel_processing.marketplace_cache_optimizer import MarketplaceDataOptimizer
|
from dev.gpu_acceleration.parallel_processing.marketplace_cache_optimizer import MarketplaceDataOptimizer
|
||||||
from aitbc.gpu_acceleration.parallel_processing.marketplace_monitor import monitor as marketplace_monitor
|
from dev.gpu_acceleration.parallel_processing.marketplace_monitor import monitor as marketplace_monitor
|
||||||
from aitbc.gpu_acceleration.parallel_processing.marketplace_scaler import ResourceScaler
|
from dev.gpu_acceleration.parallel_processing.marketplace_scaler import ResourceScaler
|
||||||
|
|
||||||
router = APIRouter(prefix="/v1/marketplace/performance", tags=["marketplace-performance"])
|
router = APIRouter(prefix="/v1/marketplace/performance", tags=["marketplace-performance"])
|
||||||
|
|
||||||
|
|||||||
@@ -1,354 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
FastAPI Integration for Production CUDA ZK Accelerator
|
|
||||||
Provides REST API endpoints for GPU-accelerated ZK circuit operations
|
|
||||||
"""
|
|
||||||
|
|
||||||
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
from typing import Dict, List, Optional, Any
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
# Add GPU acceleration path
|
|
||||||
sys.path.append('/home/oib/windsurf/aitbc/gpu_acceleration')
|
|
||||||
|
|
||||||
try:
|
|
||||||
from production_cuda_zk_api import ProductionCUDAZKAPI, ZKOperationRequest, ZKOperationResult
|
|
||||||
CUDA_AVAILABLE = True
|
|
||||||
except ImportError as e:
|
|
||||||
CUDA_AVAILABLE = False
|
|
||||||
print(f"⚠️ CUDA API import failed: {e}")
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger("CUDA_ZK_FASTAPI")
|
|
||||||
|
|
||||||
# Initialize FastAPI app
|
|
||||||
app = FastAPI(
|
|
||||||
title="AITBC CUDA ZK Acceleration API",
|
|
||||||
description="Production-ready GPU acceleration for zero-knowledge circuit operations",
|
|
||||||
version="1.0.0",
|
|
||||||
docs_url="/docs",
|
|
||||||
redoc_url="/redoc"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add CORS middleware
|
|
||||||
app.add_middleware(
|
|
||||||
CORSMiddleware,
|
|
||||||
allow_origins=["*"],
|
|
||||||
allow_credentials=True,
|
|
||||||
allow_methods=["*"],
|
|
||||||
allow_headers=["*"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize CUDA API
|
|
||||||
cuda_api = ProductionCUDAZKAPI()
|
|
||||||
|
|
||||||
# Pydantic models for API
|
|
||||||
class FieldAdditionRequest(BaseModel):
    """Request body for the GPU-accelerated field-addition endpoint."""

    # Element count is capped at 10M to bound GPU memory usage.
    num_elements: int = Field(..., ge=1, le=10000000, description="Number of field elements")
    # Field modulus as four 64-bit limbs; default is an all-ones modulus.
    modulus: Optional[List[int]] = Field(default=[0xFFFFFFFFFFFFFFFF] * 4, description="Field modulus")
    optimization_level: str = Field(default="high", pattern="^(low|medium|high)$")
    use_gpu: bool = Field(default=True, description="Use GPU acceleration")
|
|
||||||
|
|
||||||
class ConstraintVerificationRequest(BaseModel):
    """Request body for the GPU-accelerated constraint-verification endpoint."""

    num_constraints: int = Field(..., ge=1, le=10000000, description="Number of constraints")
    # Optional explicit constraint payload; backend synthesizes data when None.
    constraints: Optional[List[Dict[str, Any]]] = Field(default=None, description="Constraint data")
    optimization_level: str = Field(default="high", pattern="^(low|medium|high)$")
    use_gpu: bool = Field(default=True, description="Use GPU acceleration")
|
|
||||||
|
|
||||||
class WitnessGenerationRequest(BaseModel):
    """Request body for the GPU-accelerated witness-generation endpoint."""

    num_inputs: int = Field(..., ge=1, le=1000000, description="Number of inputs")
    witness_size: int = Field(..., ge=1, le=10000000, description="Witness size")
    optimization_level: str = Field(default="high", pattern="^(low|medium|high)$")
    use_gpu: bool = Field(default=True, description="Use GPU acceleration")
|
|
||||||
|
|
||||||
class BenchmarkRequest(BaseModel):
    """Request body for the comprehensive benchmark endpoint."""

    # Lower bound of 1000 keeps the benchmark meaningful; 10M caps runtime.
    max_elements: int = Field(default=1000000, ge=1000, le=10000000, description="Maximum elements to benchmark")
|
|
||||||
|
|
||||||
class APIResponse(BaseModel):
|
|
||||||
success: bool
|
|
||||||
message: str
|
|
||||||
data: Optional[Dict[str, Any]] = None
|
|
||||||
execution_time: Optional[float] = None
|
|
||||||
gpu_used: Optional[bool] = None
|
|
||||||
speedup: Optional[float] = None
|
|
||||||
|
|
||||||
# Health check endpoint
|
|
||||||
@app.get("/health", response_model=Dict[str, Any])
async def health_check():
    """Report service liveness together with basic CUDA/GPU status.

    Returns a 500 error when the statistics backend itself fails.
    """
    try:
        stats = cuda_api.get_performance_statistics()
        return {
            "status": "healthy",
            "timestamp": time.time(),
            "cuda_available": stats["cuda_available"],
            "cuda_initialized": stats["cuda_initialized"],
            "gpu_device": stats["gpu_device"],
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Performance statistics endpoint
|
|
||||||
@app.get("/stats", response_model=Dict[str, Any])
async def get_performance_stats():
    """Expose the CUDA backend's performance statistics verbatim."""
    try:
        return cuda_api.get_performance_statistics()
    except Exception as e:
        # Any backend failure surfaces as a 500 with the error text.
        logger.error(f"Failed to get stats: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Field addition endpoint
|
|
||||||
@app.post("/field-addition", response_model=APIResponse)
async def field_addition(request: FieldAdditionRequest):
    """Perform GPU-accelerated field addition.

    Wraps the request into a ZKOperationRequest, dispatches it to the CUDA
    backend, and maps the result onto the generic APIResponse envelope.

    Raises:
        HTTPException: 500 with the error text on any backend failure.

    Fix: removed the dead `start_time = time.time()` local — it was never
    read; execution time comes from the backend result instead.
    """
    try:
        zk_request = ZKOperationRequest(
            operation_type="field_addition",
            circuit_data={
                "num_elements": request.num_elements,
                "modulus": request.modulus,
            },
            optimization_level=request.optimization_level,
            use_gpu=request.use_gpu,
        )

        result = await cuda_api.process_zk_operation(zk_request)

        return APIResponse(
            success=result.success,
            message="Field addition completed successfully" if result.success else "Field addition failed",
            data=result.result_data,
            execution_time=result.execution_time,
            gpu_used=result.gpu_used,
            speedup=result.speedup,
        )

    except Exception as e:
        logger.error(f"Field addition failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Constraint verification endpoint
|
|
||||||
@app.post("/constraint-verification", response_model=APIResponse)
async def constraint_verification(request: ConstraintVerificationRequest):
    """Perform GPU-accelerated constraint verification.

    Raises:
        HTTPException: 500 with the error text on any backend failure.

    Fix: removed the dead `start_time = time.time()` local — it was never
    read; execution time comes from the backend result instead.
    """
    try:
        zk_request = ZKOperationRequest(
            operation_type="constraint_verification",
            circuit_data={"num_constraints": request.num_constraints},
            constraints=request.constraints,
            optimization_level=request.optimization_level,
            use_gpu=request.use_gpu,
        )

        result = await cuda_api.process_zk_operation(zk_request)

        return APIResponse(
            success=result.success,
            message="Constraint verification completed successfully" if result.success else "Constraint verification failed",
            data=result.result_data,
            execution_time=result.execution_time,
            gpu_used=result.gpu_used,
            speedup=result.speedup,
        )

    except Exception as e:
        logger.error(f"Constraint verification failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Witness generation endpoint
|
|
||||||
@app.post("/witness-generation", response_model=APIResponse)
async def witness_generation(request: WitnessGenerationRequest):
    """Perform GPU-accelerated witness generation.

    Raises:
        HTTPException: 500 with the error text on any backend failure.

    Fix: removed the dead `start_time = time.time()` local — it was never
    read; execution time comes from the backend result instead.
    """
    try:
        zk_request = ZKOperationRequest(
            operation_type="witness_generation",
            circuit_data={"num_inputs": request.num_inputs},
            witness_data={"num_inputs": request.num_inputs, "witness_size": request.witness_size},
            optimization_level=request.optimization_level,
            use_gpu=request.use_gpu,
        )

        result = await cuda_api.process_zk_operation(zk_request)

        return APIResponse(
            success=result.success,
            message="Witness generation completed successfully" if result.success else "Witness generation failed",
            data=result.result_data,
            execution_time=result.execution_time,
            gpu_used=result.gpu_used,
            speedup=result.speedup,
        )

    except Exception as e:
        logger.error(f"Witness generation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Comprehensive benchmark endpoint
|
|
||||||
@app.post("/benchmark", response_model=Dict[str, Any])
async def comprehensive_benchmark(request: BenchmarkRequest, background_tasks: BackgroundTasks):
    """Run the full CUDA benchmark suite up to ``request.max_elements`` elements.

    NOTE(review): ``background_tasks`` is accepted for interface compatibility
    but the benchmark is awaited inline, not scheduled as a background task.
    """
    try:
        logger.info(f"Starting comprehensive benchmark up to {request.max_elements:,} elements")

        outcome = await cuda_api.benchmark_comprehensive_performance(request.max_elements)

        return {
            "success": True,
            "message": "Comprehensive benchmark completed",
            "data": outcome,
            "timestamp": time.time(),
        }

    except Exception as e:
        logger.error(f"Benchmark failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Quick benchmark endpoint
|
|
||||||
@app.get("/quick-benchmark", response_model=Dict[str, Any])
async def quick_benchmark():
    """Run a fast smoke benchmark: 100K field additions and 50K constraint checks."""
    try:
        logger.info("Running quick benchmark")

        def summarize(res):
            # Collapse a backend result object into a JSON-friendly dict.
            return {
                "success": res.success,
                "execution_time": res.execution_time,
                "gpu_used": res.gpu_used,
                "speedup": res.speedup,
                "throughput": res.throughput,
            }

        # Field addition with 100K elements.
        field_result = await cuda_api.process_zk_operation(
            ZKOperationRequest(
                operation_type="field_addition",
                circuit_data={"num_elements": 100000},
                use_gpu=True,
            )
        )

        # Constraint verification with 50K constraints.
        constraint_result = await cuda_api.process_zk_operation(
            ZKOperationRequest(
                operation_type="constraint_verification",
                circuit_data={"num_constraints": 50000},
                use_gpu=True,
            )
        )

        return {
            "success": True,
            "message": "Quick benchmark completed",
            "data": {
                "field_addition": summarize(field_result),
                "constraint_verification": summarize(constraint_result),
            },
            "timestamp": time.time(),
        }

    except Exception as e:
        logger.error(f"Quick benchmark failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# GPU information endpoint
|
|
||||||
@app.get("/gpu-info", response_model=Dict[str, Any])
async def get_gpu_info():
    """Summarize GPU capabilities and per-backend operation counters."""
    try:
        stats = cuda_api.get_performance_statistics()

        # Required counters first; derived averages fall back to 0 when the
        # backend has not computed them yet.
        info = {
            key: stats[key]
            for key in (
                "cuda_available",
                "cuda_initialized",
                "gpu_device",
                "total_operations",
                "gpu_operations",
                "cpu_operations",
            )
        }
        for key in ("gpu_usage_rate", "average_speedup", "average_execution_time"):
            info[key] = stats.get(key, 0)
        return info

    except Exception as e:
        logger.error(f"Failed to get GPU info: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Reset statistics endpoint
|
|
||||||
@app.post("/reset-stats", response_model=Dict[str, Any])
async def reset_statistics():
    """Reset the CUDA backend's performance counters.

    Raises:
        HTTPException: 500 with the error text on any backend failure.

    Fix: the previous ``response_model=Dict[str, str]`` contradicted the
    actual payload, whose "success" value is a bool; ``Dict[str, Any]`` lets
    the response validate without coercing the bool into a string.
    """
    try:
        # Assign a fresh dict rather than mutating in place, so a partially
        # reset counter set can never be observed.
        cuda_api.operation_stats = {
            "total_operations": 0,
            "gpu_operations": 0,
            "cpu_operations": 0,
            "total_time": 0.0,
            "average_speedup": 0.0,
        }

        return {"success": True, "message": "Statistics reset successfully"}

    except Exception as e:
        logger.error(f"Failed to reset stats: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
# Root endpoint
|
|
||||||
@app.get("/", response_model=Dict[str, Any])
async def root():
    """Describe the API: name, version, and a map of every exposed route."""
    endpoint_map = {
        "health": "/health",
        "stats": "/stats",
        "gpu_info": "/gpu-info",
        "field_addition": "/field-addition",
        "constraint_verification": "/constraint-verification",
        "witness_generation": "/witness-generation",
        "quick_benchmark": "/quick-benchmark",
        "comprehensive_benchmark": "/benchmark",
        "docs": "/docs",
        "redoc": "/redoc",
    }
    return {
        "name": "AITBC CUDA ZK Acceleration API",
        "version": "1.0.0",
        "description": "Production-ready GPU acceleration for zero-knowledge circuit operations",
        "endpoints": endpoint_map,
        "cuda_available": CUDA_AVAILABLE,
        "timestamp": time.time(),
    }
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    import uvicorn

    divider = "=" * 50
    print("🚀 Starting AITBC CUDA ZK Acceleration API Server")
    print(divider)
    print(f" CUDA Available: {CUDA_AVAILABLE}")
    print(" API Documentation: http://localhost:8001/docs")
    print(" ReDoc Documentation: http://localhost:8001/redoc")
    print(divider)

    # Dev-style server: autoreload enabled, bound to all interfaces on 8001.
    uvicorn.run(
        "fastapi_cuda_zk_api:app",
        host="0.0.0.0",
        port=8001,
        reload=True,
        log_level="info",
    )
|
|
||||||
@@ -1,453 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
High-Performance CUDA ZK Accelerator with Optimized Kernels
|
|
||||||
Implements optimized CUDA kernels with memory coalescing, vectorization, and shared memory
|
|
||||||
"""
|
|
||||||
|
|
||||||
import ctypes
|
|
||||||
import numpy as np
|
|
||||||
from typing import List, Tuple, Optional
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
|
|
||||||
# Optimized field element structure for flat array access
|
|
||||||
class OptimizedFieldElement(ctypes.Structure):
    """256-bit field element stored as four contiguous 64-bit limbs."""

    _fields_ = [("limbs", ctypes.c_uint64 * 4)]
|
|
||||||
|
|
||||||
class HighPerformanceCUDAZKAccelerator:
|
|
||||||
"""High-performance Python interface for optimized CUDA ZK operations"""
|
|
||||||
|
|
||||||
def __init__(self, lib_path: str = None):
|
|
||||||
"""
|
|
||||||
Initialize high-performance CUDA accelerator
|
|
||||||
|
|
||||||
Args:
|
|
||||||
lib_path: Path to compiled optimized CUDA library (.so file)
|
|
||||||
"""
|
|
||||||
self.lib_path = lib_path or self._find_optimized_cuda_lib()
|
|
||||||
self.lib = None
|
|
||||||
self.initialized = False
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.lib = ctypes.CDLL(self.lib_path)
|
|
||||||
self._setup_function_signatures()
|
|
||||||
self.initialized = True
|
|
||||||
print(f"✅ High-Performance CUDA ZK Accelerator initialized: {self.lib_path}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ Failed to initialize CUDA accelerator: {e}")
|
|
||||||
self.initialized = False
|
|
||||||
|
|
||||||
def _find_optimized_cuda_lib(self) -> str:
|
|
||||||
"""Find the compiled optimized CUDA library"""
|
|
||||||
possible_paths = [
|
|
||||||
"./liboptimized_field_operations.so",
|
|
||||||
"./optimized_field_operations.so",
|
|
||||||
"../liboptimized_field_operations.so",
|
|
||||||
"../../liboptimized_field_operations.so",
|
|
||||||
"/usr/local/lib/liboptimized_field_operations.so"
|
|
||||||
]
|
|
||||||
|
|
||||||
for path in possible_paths:
|
|
||||||
if os.path.exists(path):
|
|
||||||
return path
|
|
||||||
|
|
||||||
raise FileNotFoundError("Optimized CUDA library not found. Please compile optimized_field_operations.cu first.")
|
|
||||||
|
|
||||||
def _setup_function_signatures(self):
|
|
||||||
"""Setup function signatures for optimized CUDA library functions"""
|
|
||||||
if not self.lib:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Initialize optimized CUDA device
|
|
||||||
self.lib.init_optimized_cuda_device.argtypes = []
|
|
||||||
self.lib.init_optimized_cuda_device.restype = ctypes.c_int
|
|
||||||
|
|
||||||
# Optimized field addition with flat arrays
|
|
||||||
self.lib.gpu_optimized_field_addition.argtypes = [
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
ctypes.c_int
|
|
||||||
]
|
|
||||||
self.lib.gpu_optimized_field_addition.restype = ctypes.c_int
|
|
||||||
|
|
||||||
# Vectorized field addition
|
|
||||||
self.lib.gpu_vectorized_field_addition.argtypes = [
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), # field_vector_t
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
ctypes.c_int
|
|
||||||
]
|
|
||||||
self.lib.gpu_vectorized_field_addition.restype = ctypes.c_int
|
|
||||||
|
|
||||||
# Shared memory field addition
|
|
||||||
self.lib.gpu_shared_memory_field_addition.argtypes = [
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"),
|
|
||||||
ctypes.c_int
|
|
||||||
]
|
|
||||||
self.lib.gpu_shared_memory_field_addition.restype = ctypes.c_int
|
|
||||||
|
|
||||||
def init_device(self) -> bool:
|
|
||||||
"""Initialize optimized CUDA device and check capabilities"""
|
|
||||||
if not self.initialized:
|
|
||||||
print("❌ CUDA accelerator not initialized")
|
|
||||||
return False
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = self.lib.init_optimized_cuda_device()
|
|
||||||
if result == 0:
|
|
||||||
print("✅ Optimized CUDA device initialized successfully")
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
print(f"❌ CUDA device initialization failed: {result}")
|
|
||||||
return False
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ CUDA device initialization error: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def benchmark_optimized_kernels(self, max_elements: int = 10000000) -> dict:
|
|
||||||
"""
|
|
||||||
Benchmark all optimized CUDA kernels and compare performance
|
|
||||||
|
|
||||||
Args:
|
|
||||||
max_elements: Maximum number of elements to test
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Comprehensive performance benchmark results
|
|
||||||
"""
|
|
||||||
if not self.initialized:
|
|
||||||
return {"error": "CUDA accelerator not initialized"}
|
|
||||||
|
|
||||||
print(f"🚀 High-Performance CUDA Kernel Benchmark (up to {max_elements:,} elements)")
|
|
||||||
print("=" * 80)
|
|
||||||
|
|
||||||
# Test different dataset sizes
|
|
||||||
test_sizes = [
|
|
||||||
1000, # 1K elements
|
|
||||||
10000, # 10K elements
|
|
||||||
100000, # 100K elements
|
|
||||||
1000000, # 1M elements
|
|
||||||
5000000, # 5M elements
|
|
||||||
10000000, # 10M elements
|
|
||||||
]
|
|
||||||
|
|
||||||
results = {
|
|
||||||
"test_sizes": [],
|
|
||||||
"optimized_flat": [],
|
|
||||||
"vectorized": [],
|
|
||||||
"shared_memory": [],
|
|
||||||
"cpu_baseline": [],
|
|
||||||
"performance_summary": {}
|
|
||||||
}
|
|
||||||
|
|
||||||
for size in test_sizes:
|
|
||||||
if size > max_elements:
|
|
||||||
break
|
|
||||||
|
|
||||||
print(f"\n📊 Benchmarking {size:,} elements...")
|
|
||||||
|
|
||||||
# Generate test data as flat arrays for optimal memory access
|
|
||||||
a_flat, b_flat = self._generate_flat_test_data(size)
|
|
||||||
|
|
||||||
# bn128 field modulus (simplified)
|
|
||||||
modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF]
|
|
||||||
|
|
||||||
# Benchmark optimized flat array kernel
|
|
||||||
flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, size)
|
|
||||||
|
|
||||||
# Benchmark vectorized kernel
|
|
||||||
vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, size)
|
|
||||||
|
|
||||||
# Benchmark shared memory kernel
|
|
||||||
shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, size)
|
|
||||||
|
|
||||||
# Benchmark CPU baseline
|
|
||||||
cpu_result = self._benchmark_cpu_baseline(a_flat, b_flat, modulus, size)
|
|
||||||
|
|
||||||
# Store results
|
|
||||||
results["test_sizes"].append(size)
|
|
||||||
results["optimized_flat"].append(flat_result)
|
|
||||||
results["vectorized"].append(vec_result)
|
|
||||||
results["shared_memory"].append(shared_result)
|
|
||||||
results["cpu_baseline"].append(cpu_result)
|
|
||||||
|
|
||||||
# Print comparison
|
|
||||||
print(f" Optimized Flat: {flat_result['time']:.4f}s, {flat_result['throughput']:.0f} elem/s")
|
|
||||||
print(f" Vectorized: {vec_result['time']:.4f}s, {vec_result['throughput']:.0f} elem/s")
|
|
||||||
print(f" Shared Memory: {shared_result['time']:.4f}s, {shared_result['throughput']:.0f} elem/s")
|
|
||||||
print(f" CPU Baseline: {cpu_result['time']:.4f}s, {cpu_result['throughput']:.0f} elem/s")
|
|
||||||
|
|
||||||
# Calculate speedups
|
|
||||||
flat_speedup = cpu_result['time'] / flat_result['time'] if flat_result['time'] > 0 else 0
|
|
||||||
vec_speedup = cpu_result['time'] / vec_result['time'] if vec_result['time'] > 0 else 0
|
|
||||||
shared_speedup = cpu_result['time'] / shared_result['time'] if shared_result['time'] > 0 else 0
|
|
||||||
|
|
||||||
print(f" Speedups - Flat: {flat_speedup:.2f}x, Vec: {vec_speedup:.2f}x, Shared: {shared_speedup:.2f}x")
|
|
||||||
|
|
||||||
# Calculate performance summary
|
|
||||||
results["performance_summary"] = self._calculate_performance_summary(results)
|
|
||||||
|
|
||||||
# Print final summary
|
|
||||||
self._print_performance_summary(results["performance_summary"])
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
def _benchmark_optimized_flat_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
||||||
modulus: List[int], num_elements: int) -> dict:
|
|
||||||
"""Benchmark optimized flat array kernel"""
|
|
||||||
try:
|
|
||||||
result_flat = np.zeros_like(a_flat)
|
|
||||||
modulus_array = np.array(modulus, dtype=np.uint64)
|
|
||||||
|
|
||||||
# Multiple runs for consistency
|
|
||||||
times = []
|
|
||||||
for run in range(3):
|
|
||||||
start_time = time.time()
|
|
||||||
success = self.lib.gpu_optimized_field_addition(
|
|
||||||
a_flat, b_flat, result_flat, modulus_array, num_elements
|
|
||||||
)
|
|
||||||
run_time = time.time() - start_time
|
|
||||||
|
|
||||||
if success == 0: # Success
|
|
||||||
times.append(run_time)
|
|
||||||
|
|
||||||
if not times:
|
|
||||||
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
||||||
|
|
||||||
avg_time = sum(times) / len(times)
|
|
||||||
throughput = num_elements / avg_time if avg_time > 0 else 0
|
|
||||||
|
|
||||||
return {"time": avg_time, "throughput": throughput, "success": True}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ Optimized flat kernel error: {e}")
|
|
||||||
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
||||||
|
|
||||||
def _benchmark_vectorized_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
||||||
modulus: List[int], num_elements: int) -> dict:
|
|
||||||
"""Benchmark vectorized kernel"""
|
|
||||||
try:
|
|
||||||
# Convert flat arrays to vectorized format (uint4)
|
|
||||||
# For simplicity, we'll reuse the flat array kernel as vectorized
|
|
||||||
# In practice, would convert to proper vector format
|
|
||||||
result_flat = np.zeros_like(a_flat)
|
|
||||||
modulus_array = np.array(modulus, dtype=np.uint64)
|
|
||||||
|
|
||||||
times = []
|
|
||||||
for run in range(3):
|
|
||||||
start_time = time.time()
|
|
||||||
success = self.lib.gpu_vectorized_field_addition(
|
|
||||||
a_flat, b_flat, result_flat, modulus_array, num_elements
|
|
||||||
)
|
|
||||||
run_time = time.time() - start_time
|
|
||||||
|
|
||||||
if success == 0:
|
|
||||||
times.append(run_time)
|
|
||||||
|
|
||||||
if not times:
|
|
||||||
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
||||||
|
|
||||||
avg_time = sum(times) / len(times)
|
|
||||||
throughput = num_elements / avg_time if avg_time > 0 else 0
|
|
||||||
|
|
||||||
return {"time": avg_time, "throughput": throughput, "success": True}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ Vectorized kernel error: {e}")
|
|
||||||
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
||||||
|
|
||||||
def _benchmark_shared_memory_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
||||||
modulus: List[int], num_elements: int) -> dict:
|
|
||||||
"""Benchmark shared memory kernel"""
|
|
||||||
try:
|
|
||||||
result_flat = np.zeros_like(a_flat)
|
|
||||||
modulus_array = np.array(modulus, dtype=np.uint64)
|
|
||||||
|
|
||||||
times = []
|
|
||||||
for run in range(3):
|
|
||||||
start_time = time.time()
|
|
||||||
success = self.lib.gpu_shared_memory_field_addition(
|
|
||||||
a_flat, b_flat, result_flat, modulus_array, num_elements
|
|
||||||
)
|
|
||||||
run_time = time.time() - start_time
|
|
||||||
|
|
||||||
if success == 0:
|
|
||||||
times.append(run_time)
|
|
||||||
|
|
||||||
if not times:
|
|
||||||
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
||||||
|
|
||||||
avg_time = sum(times) / len(times)
|
|
||||||
throughput = num_elements / avg_time if avg_time > 0 else 0
|
|
||||||
|
|
||||||
return {"time": avg_time, "throughput": throughput, "success": True}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ Shared memory kernel error: {e}")
|
|
||||||
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
||||||
|
|
||||||
def _benchmark_cpu_baseline(self, a_flat: np.ndarray, b_flat: np.ndarray,
|
|
||||||
modulus: List[int], num_elements: int) -> dict:
|
|
||||||
"""Benchmark CPU baseline for comparison"""
|
|
||||||
try:
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
# Simple CPU field addition
|
|
||||||
result_flat = np.zeros_like(a_flat)
|
|
||||||
for i in range(num_elements):
|
|
||||||
base_idx = i * 4
|
|
||||||
for j in range(4):
|
|
||||||
result_flat[base_idx + j] = (a_flat[base_idx + j] + b_flat[base_idx + j]) % modulus[j]
|
|
||||||
|
|
||||||
cpu_time = time.time() - start_time
|
|
||||||
throughput = num_elements / cpu_time if cpu_time > 0 else 0
|
|
||||||
|
|
||||||
return {"time": cpu_time, "throughput": throughput, "success": True}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ❌ CPU baseline error: {e}")
|
|
||||||
return {"time": float('inf'), "throughput": 0, "success": False}
|
|
||||||
|
|
||||||
def _generate_flat_test_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]:
|
|
||||||
"""Generate flat array test data for optimal memory access"""
|
|
||||||
# Generate flat arrays (num_elements * 4 limbs)
|
|
||||||
flat_size = num_elements * 4
|
|
||||||
|
|
||||||
# Use numpy for fast generation
|
|
||||||
a_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
|
|
||||||
b_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
|
|
||||||
|
|
||||||
return a_flat, b_flat
|
|
||||||
|
|
||||||
def _calculate_performance_summary(self, results: dict) -> dict:
|
|
||||||
"""Calculate performance summary statistics"""
|
|
||||||
summary = {}
|
|
||||||
|
|
||||||
# Find best performing kernel for each size
|
|
||||||
best_speedups = []
|
|
||||||
best_throughputs = []
|
|
||||||
|
|
||||||
for i, size in enumerate(results["test_sizes"]):
|
|
||||||
cpu_time = results["cpu_baseline"][i]["time"]
|
|
||||||
|
|
||||||
# Calculate speedups
|
|
||||||
flat_speedup = cpu_time / results["optimized_flat"][i]["time"] if results["optimized_flat"][i]["time"] > 0 else 0
|
|
||||||
vec_speedup = cpu_time / results["vectorized"][i]["time"] if results["vectorized"][i]["time"] > 0 else 0
|
|
||||||
shared_speedup = cpu_time / results["shared_memory"][i]["time"] if results["shared_memory"][i]["time"] > 0 else 0
|
|
||||||
|
|
||||||
best_speedup = max(flat_speedup, vec_speedup, shared_speedup)
|
|
||||||
best_speedups.append(best_speedup)
|
|
||||||
|
|
||||||
# Find best throughput
|
|
||||||
best_throughput = max(
|
|
||||||
results["optimized_flat"][i]["throughput"],
|
|
||||||
results["vectorized"][i]["throughput"],
|
|
||||||
results["shared_memory"][i]["throughput"]
|
|
||||||
)
|
|
||||||
best_throughputs.append(best_throughput)
|
|
||||||
|
|
||||||
if best_speedups:
|
|
||||||
summary["best_speedup"] = max(best_speedups)
|
|
||||||
summary["average_speedup"] = sum(best_speedups) / len(best_speedups)
|
|
||||||
summary["best_speedup_size"] = results["test_sizes"][best_speedups.index(max(best_speedups))]
|
|
||||||
|
|
||||||
if best_throughputs:
|
|
||||||
summary["best_throughput"] = max(best_throughputs)
|
|
||||||
summary["average_throughput"] = sum(best_throughputs) / len(best_throughputs)
|
|
||||||
summary["best_throughput_size"] = results["test_sizes"][best_throughputs.index(max(best_throughputs))]
|
|
||||||
|
|
||||||
return summary
|
|
||||||
|
|
||||||
def _print_performance_summary(self, summary: dict):
    """Print a human-readable report for a benchmark summary dict."""
    print(f"\n🎯 High-Performance CUDA Summary:")
    print("=" * 50)

    if "best_speedup" in summary:
        print(f" Best Speedup: {summary['best_speedup']:.2f}x at {summary.get('best_speedup_size', 'N/A'):,} elements")
        print(f" Average Speedup: {summary['average_speedup']:.2f}x across all tests")

    if "best_throughput" in summary:
        print(f" Best Throughput: {summary['best_throughput']:.0f} elements/s at {summary.get('best_throughput_size', 'N/A'):,} elements")
        print(f" Average Throughput: {summary['average_throughput']:.0f} elements/s")

    # Map the best speedup onto a qualitative rating; the first matching
    # threshold wins, mirroring the original if/elif chain.
    best = summary.get("best_speedup", 0)
    if best > 5:
        verdict = " 🚀 Performance: EXCELLENT - Significant GPU acceleration achieved"
    elif best > 2:
        verdict = " ✅ Performance: GOOD - Measurable GPU acceleration achieved"
    elif best > 1:
        verdict = " ⚠️ Performance: MODERATE - Limited GPU acceleration"
    else:
        verdict = " ❌ Performance: POOR - No significant GPU acceleration"
    print(verdict)
|
|
||||||
|
|
||||||
def analyze_memory_bandwidth(self, num_elements: int = 1000000) -> dict:
    """Measure effective memory bandwidth for each kernel variant.

    Runs the flat, vectorized and shared-memory kernels on freshly
    generated test data and converts each runtime into GB/s based on the
    total bytes moved (3 arrays, 4 limbs, 8 bytes per limb).
    """
    print(f"🔍 Analyzing Memory Bandwidth Performance ({num_elements:,} elements)...")

    lhs, rhs = self._generate_flat_test_data(num_elements)
    modulus = [0xFFFFFFFFFFFFFFFF] * 4

    # Run the three kernel variants in the same order as before.
    flat_run = self._benchmark_optimized_flat_kernel(lhs, rhs, modulus, num_elements)
    vector_run = self._benchmark_vectorized_kernel(lhs, rhs, modulus, num_elements)
    shared_run = self._benchmark_shared_memory_kernel(lhs, rhs, modulus, num_elements)

    # Total traffic: 3 arrays * 4 limbs * 8 bytes per element.
    total_bytes = num_elements * 4 * 8 * 3

    def _bandwidth(elapsed: float) -> float:
        # GB/s; zero when the kernel did not produce a usable timing.
        return total_bytes / (elapsed * 1024**3) if elapsed > 0 else 0

    analysis = {
        "data_size_gb": total_bytes / (1024**3),
        "flat_bandwidth_gb_s": _bandwidth(flat_run["time"]),
        "vectorized_bandwidth_gb_s": _bandwidth(vector_run["time"]),
        "shared_bandwidth_gb_s": _bandwidth(shared_run["time"]),
    }

    print(f" Data Size: {analysis['data_size_gb']:.2f} GB")
    print(f" Flat Kernel: {analysis['flat_bandwidth_gb_s']:.2f} GB/s")
    print(f" Vectorized Kernel: {analysis['vectorized_bandwidth_gb_s']:.2f} GB/s")
    print(f" Shared Memory Kernel: {analysis['shared_bandwidth_gb_s']:.2f} GB/s")

    return analysis
|
|
||||||
|
|
||||||
def main():
    """Entry point: exercise the high-performance CUDA ZK accelerator."""
    print("🚀 AITBC High-Performance CUDA ZK Accelerator Test")
    print("=" * 60)

    try:
        accelerator = HighPerformanceCUDAZKAccelerator()

        # Bail out early if construction or device setup failed.
        if not accelerator.initialized:
            print("❌ Failed to initialize CUDA accelerator")
            return
        if not accelerator.init_device():
            return

        # Full kernel benchmark followed by a bandwidth analysis.
        results = accelerator.benchmark_optimized_kernels(10000000)
        bandwidth_analysis = accelerator.analyze_memory_bandwidth(1000000)

        print("\n✅ High-Performance CUDA acceleration test completed!")

        if results.get("performance_summary", {}).get("best_speedup", 0) > 1:
            print(f"🚀 Optimization successful: {results['performance_summary']['best_speedup']:.2f}x speedup achieved")
        else:
            print("⚠️ Further optimization needed")

    except Exception as e:
        print(f"❌ Test failed: {e}")


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,609 +0,0 @@
|
|||||||
#!/usr/bin/env python3
"""
Production-Ready CUDA ZK Accelerator API
Integrates optimized CUDA kernels with AITBC ZK workflow and Coordinator API
"""

import asyncio
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any

import numpy as np

# Configure CUDA library paths before importing CUDA modules.
# NOTE(review): this clobbers any pre-existing LD_LIBRARY_PATH and the
# dynamic loader may not re-read it mid-process -- confirm it takes effect.
os.environ['LD_LIBRARY_PATH'] = '/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64'

# Add CUDA accelerator path so the optimized kernels can be imported.
# NOTE(review): absolute user-specific path; should come from configuration.
sys.path.append('/home/oib/windsurf/aitbc/gpu_acceleration')

try:
    from high_performance_cuda_accelerator import HighPerformanceCUDAZKAccelerator
    CUDA_AVAILABLE = True
except ImportError as e:
    # Degrade gracefully: the API still works using CPU estimates only.
    CUDA_AVAILABLE = False
    print(f"⚠️ CUDA accelerator import failed: {e}")
    print(" Falling back to CPU operations")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("CUDA_ZK_API")
|
|
||||||
|
|
||||||
@dataclass
class ZKOperationRequest:
    """Request structure for ZK operations."""
    # One of: 'field_addition', 'constraint_verification', 'witness_generation'
    operation_type: str
    circuit_data: Dict[str, Any]
    witness_data: Optional[Dict[str, Any]] = None
    constraints: Optional[List[Dict[str, Any]]] = None
    # 'low', 'medium' or 'high'
    optimization_level: str = "high"
    use_gpu: bool = True
    timeout_seconds: int = 300
|
|
||||||
|
|
||||||
@dataclass
class ZKOperationResult:
    """Result structure for ZK operations."""
    success: bool
    operation_type: str
    # Wall-clock seconds for the whole operation.
    execution_time: float
    gpu_used: bool
    # CPU time / GPU time when the GPU path ran; None otherwise.
    speedup: Optional[float] = None
    throughput: Optional[float] = None
    result_data: Optional[Dict[str, Any]] = None
    error_message: Optional[str] = None
    performance_metrics: Optional[Dict[str, Any]] = None
|
|
||||||
|
|
||||||
class ProductionCUDAZKAPI:
    """Production-ready CUDA ZK Accelerator API.

    Routes ZK operation requests to the GPU accelerator when available,
    falls back to CPU timing estimates otherwise, and keeps running usage
    statistics across all processed operations.
    """

    def __init__(self):
        """Initialize the API and attempt to bring up the CUDA accelerator."""
        self.cuda_accelerator = None
        self.initialized = False
        self.performance_cache = {}
        # Running counters updated by process_zk_operation().
        self.operation_stats = {
            "total_operations": 0,
            "gpu_operations": 0,
            "cpu_operations": 0,
            "total_time": 0.0,
            "average_speedup": 0.0,
        }

        self._initialize_cuda_accelerator()

        logger.info("🚀 Production CUDA ZK API initialized")
        logger.info(f" CUDA Available: {CUDA_AVAILABLE}")
        logger.info(f" GPU Accelerator: {'Ready' if self.cuda_accelerator else 'Not Available'}")

    def _initialize_cuda_accelerator(self):
        """Create and initialize the CUDA accelerator if the import succeeded."""
        if not CUDA_AVAILABLE:
            logger.warning("CUDA not available, using CPU-only operations")
            return

        try:
            self.cuda_accelerator = HighPerformanceCUDAZKAccelerator()
            if self.cuda_accelerator.init_device():
                self.initialized = True
                logger.info("✅ CUDA accelerator initialized successfully")
            else:
                logger.error("❌ Failed to initialize CUDA device")
                self.cuda_accelerator = None
        except Exception as e:
            logger.error(f"❌ CUDA accelerator initialization failed: {e}")
            self.cuda_accelerator = None

    async def process_zk_operation(self, request: ZKOperationRequest) -> ZKOperationResult:
        """
        Process a ZK operation with GPU acceleration.

        Args:
            request: ZK operation request

        Returns:
            ZK operation result
        """
        started = time.time()
        op = request.operation_type

        logger.info(f"🔄 Processing {op} operation")
        logger.info(f" GPU Requested: {request.use_gpu}")
        logger.info(f" Optimization Level: {request.optimization_level}")

        try:
            self.operation_stats["total_operations"] += 1

            # Dispatch table instead of an if/elif chain.
            handlers = {
                "field_addition": self._process_field_addition,
                "constraint_verification": self._process_constraint_verification,
                "witness_generation": self._process_witness_generation,
            }
            handler = handlers.get(op)
            if handler is not None:
                result = await handler(request)
            else:
                result = ZKOperationResult(
                    success=False,
                    operation_type=op,
                    execution_time=time.time() - started,
                    gpu_used=False,
                    error_message=f"Unsupported operation type: {op}",
                )

            elapsed = time.time() - started
            self.operation_stats["total_time"] += elapsed

            if result.gpu_used:
                self.operation_stats["gpu_operations"] += 1
                if result.speedup:
                    self._update_average_speedup(result.speedup)
            else:
                self.operation_stats["cpu_operations"] += 1

            logger.info(f"✅ Operation completed in {elapsed:.4f}s")
            if result.speedup:
                logger.info(f" Speedup: {result.speedup:.2f}x")

            return result

        except Exception as e:
            logger.error(f"❌ Operation failed: {e}")
            return ZKOperationResult(
                success=False,
                operation_type=op,
                execution_time=time.time() - started,
                gpu_used=False,
                error_message=str(e),
            )

    async def _process_field_addition(self, request: ZKOperationRequest) -> ZKOperationResult:
        """Run (or estimate) a batched field-addition workload."""
        started = time.time()

        circuit_data = request.circuit_data
        num_elements = circuit_data.get("num_elements", 1000)

        # Generate test data (in production, would use actual circuit data).
        a_flat, b_flat = self._generate_field_data(num_elements)
        modulus = circuit_data.get("modulus", [0xFFFFFFFFFFFFFFFF] * 4)

        gpu_used = False
        speedup = None
        throughput = None
        performance_metrics = None

        if request.use_gpu and self.cuda_accelerator and self.initialized:
            try:
                gpu_result = self.cuda_accelerator._benchmark_optimized_flat_kernel(
                    a_flat, b_flat, modulus, num_elements
                )

                if gpu_result["success"]:
                    gpu_used = True
                    gpu_time = gpu_result["time"]
                    throughput = gpu_result["throughput"]

                    # Compare against the CPU baseline estimate.
                    cpu_time = self._cpu_field_addition_time(num_elements)
                    speedup = cpu_time / gpu_time if gpu_time > 0 else 0

                    performance_metrics = {
                        "gpu_time": gpu_time,
                        "cpu_time": cpu_time,
                        "memory_bandwidth": self._estimate_memory_bandwidth(num_elements, gpu_time),
                        "gpu_utilization": self._estimate_gpu_utilization(num_elements),
                    }

                    logger.info(f"🚀 GPU field addition completed")
                    logger.info(f" GPU Time: {gpu_time:.4f}s")
                    logger.info(f" CPU Time: {cpu_time:.4f}s")
                    logger.info(f" Speedup: {speedup:.2f}x")
                else:
                    logger.warning("GPU operation failed, falling back to CPU")

            except Exception as e:
                logger.warning(f"GPU operation failed: {e}, falling back to CPU")

        # CPU fallback when the GPU path was skipped or failed.
        if not gpu_used:
            cpu_time = self._cpu_field_addition_time(num_elements)
            throughput = num_elements / cpu_time if cpu_time > 0 else 0
            performance_metrics = {
                "cpu_time": cpu_time,
                "cpu_throughput": throughput,
            }

        return ZKOperationResult(
            success=True,
            operation_type="field_addition",
            execution_time=time.time() - started,
            gpu_used=gpu_used,
            speedup=speedup,
            throughput=throughput,
            result_data={"num_elements": num_elements},
            performance_metrics=performance_metrics,
        )

    async def _process_constraint_verification(self, request: ZKOperationRequest) -> ZKOperationResult:
        """Verify (or estimate verification of) a batch of constraints."""
        started = time.time()

        constraints = request.constraints or []
        num_constraints = len(constraints)
        if num_constraints == 0:
            # No constraints supplied: synthesize a test batch.
            num_constraints = request.circuit_data.get("num_constraints", 1000)
            constraints = self._generate_test_constraints(num_constraints)

        gpu_used = False
        speedup = None
        throughput = None
        performance_metrics = None

        if request.use_gpu and self.cuda_accelerator and self.initialized:
            try:
                # NOTE(review): GPU time here is an estimate, not a real run.
                gpu_time = self._gpu_constraint_verification_time(num_constraints)
                gpu_used = True
                throughput = num_constraints / gpu_time if gpu_time > 0 else 0

                cpu_time = self._cpu_constraint_verification_time(num_constraints)
                speedup = cpu_time / gpu_time if gpu_time > 0 else 0

                performance_metrics = {
                    "gpu_time": gpu_time,
                    "cpu_time": cpu_time,
                    "constraints_verified": num_constraints,
                    "verification_rate": throughput,
                }

                logger.info(f"🚀 GPU constraint verification completed")
                logger.info(f" Constraints: {num_constraints}")
                logger.info(f" Speedup: {speedup:.2f}x")

            except Exception as e:
                logger.warning(f"GPU constraint verification failed: {e}, falling back to CPU")

        if not gpu_used:
            cpu_time = self._cpu_constraint_verification_time(num_constraints)
            throughput = num_constraints / cpu_time if cpu_time > 0 else 0
            performance_metrics = {
                "cpu_time": cpu_time,
                "constraints_verified": num_constraints,
                "verification_rate": throughput,
            }

        return ZKOperationResult(
            success=True,
            operation_type="constraint_verification",
            execution_time=time.time() - started,
            gpu_used=gpu_used,
            speedup=speedup,
            throughput=throughput,
            result_data={"num_constraints": num_constraints},
            performance_metrics=performance_metrics,
        )

    async def _process_witness_generation(self, request: ZKOperationRequest) -> ZKOperationResult:
        """Generate (or estimate generation of) a witness vector."""
        started = time.time()

        witness_data = request.witness_data or {}
        num_inputs = witness_data.get("num_inputs", 1000)
        witness_size = witness_data.get("witness_size", 10000)

        gpu_used = False
        speedup = None
        throughput = None
        performance_metrics = None

        if request.use_gpu and self.cuda_accelerator and self.initialized:
            try:
                # NOTE(review): GPU time here is an estimate, not a real run.
                gpu_time = self._gpu_witness_generation_time(num_inputs, witness_size)
                gpu_used = True
                throughput = witness_size / gpu_time if gpu_time > 0 else 0

                cpu_time = self._cpu_witness_generation_time(num_inputs, witness_size)
                speedup = cpu_time / gpu_time if gpu_time > 0 else 0

                performance_metrics = {
                    "gpu_time": gpu_time,
                    "cpu_time": cpu_time,
                    "witness_size": witness_size,
                    "generation_rate": throughput,
                }

                logger.info(f"🚀 GPU witness generation completed")
                logger.info(f" Witness Size: {witness_size}")
                logger.info(f" Speedup: {speedup:.2f}x")

            except Exception as e:
                logger.warning(f"GPU witness generation failed: {e}, falling back to CPU")

        if not gpu_used:
            cpu_time = self._cpu_witness_generation_time(num_inputs, witness_size)
            throughput = witness_size / cpu_time if cpu_time > 0 else 0
            performance_metrics = {
                "cpu_time": cpu_time,
                "witness_size": witness_size,
                "generation_rate": throughput,
            }

        return ZKOperationResult(
            success=True,
            operation_type="witness_generation",
            execution_time=time.time() - started,
            gpu_used=gpu_used,
            speedup=speedup,
            throughput=throughput,
            result_data={"witness_size": witness_size},
            performance_metrics=performance_metrics,
        )

    def _generate_field_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]:
        """Generate random 4-limb field operands as flat uint64 arrays."""
        flat_size = num_elements * 4
        lhs = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
        rhs = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64)
        return lhs, rhs

    def _generate_test_constraints(self, num_constraints: int) -> List[Dict[str, Any]]:
        """Generate random test constraints (a/b/c limb lists plus an op code)."""
        def _limbs():
            # Four random 32-bit limbs per operand.
            return [np.random.randint(0, 2**32) for _ in range(4)]

        return [
            {
                "a": _limbs(),
                "b": _limbs(),
                "c": _limbs(),
                "operation": np.random.choice([0, 1]),
            }
            for _ in range(num_constraints)
        ]

    def _cpu_field_addition_time(self, num_elements: int) -> float:
        """Estimated CPU field-addition time (benchmark: ~725K elements/s)."""
        return num_elements / 725000

    def _gpu_field_addition_time(self, num_elements: int) -> float:
        """Estimated GPU field-addition time (benchmark: ~120M elements/s)."""
        return num_elements / 120000000

    def _cpu_constraint_verification_time(self, num_constraints: int) -> float:
        """Estimated CPU constraint-verification time (~500K constraints/s)."""
        return num_constraints / 500000

    def _gpu_constraint_verification_time(self, num_constraints: int) -> float:
        """Estimated GPU constraint-verification time (~100M constraints/s)."""
        return num_constraints / 100000000

    def _cpu_witness_generation_time(self, num_inputs: int, witness_size: int) -> float:
        """Estimated CPU witness-generation time (~1M witness elements/s)."""
        return witness_size / 1000000

    def _gpu_witness_generation_time(self, num_inputs: int, witness_size: int) -> float:
        """Estimated GPU witness-generation time (~50M witness elements/s)."""
        return witness_size / 50000000

    def _estimate_memory_bandwidth(self, num_elements: int, gpu_time: float) -> float:
        """Estimate effective memory bandwidth in GB/s."""
        # 3 arrays * 4 limbs * 8 bytes * num_elements
        data_size_gb = (3 * 4 * 8 * num_elements) / (1024**3)
        return data_size_gb / gpu_time if gpu_time > 0 else 0

    def _estimate_gpu_utilization(self, num_elements: int) -> float:
        """Rough GPU utilization estimate based on workload size."""
        # (exclusive upper bound, estimated utilization %); first match wins.
        for bound, utilization in ((1000, 20.0), (10000, 60.0), (100000, 85.0)):
            if num_elements < bound:
                return utilization
        return 95.0  # very high utilization for large workloads

    def _update_average_speedup(self, new_speedup: float):
        """Fold a new speedup sample into the running average."""
        count = self.operation_stats["gpu_operations"]
        if count == 1:
            self.operation_stats["average_speedup"] = new_speedup
        else:
            prev_avg = self.operation_stats["average_speedup"]
            self.operation_stats["average_speedup"] = (
                (prev_avg * (count - 1) + new_speedup) / count
            )

    def get_performance_statistics(self) -> Dict[str, Any]:
        """Return a snapshot of usage counters plus derived rates."""
        stats = self.operation_stats.copy()
        total = stats["total_operations"]

        if total > 0:
            stats["average_execution_time"] = stats["total_time"] / total
            stats["gpu_usage_rate"] = stats["gpu_operations"] / total * 100
            stats["cpu_usage_rate"] = stats["cpu_operations"] / total * 100
        else:
            stats["average_execution_time"] = 0
            stats["gpu_usage_rate"] = 0
            stats["cpu_usage_rate"] = 0

        stats["cuda_available"] = CUDA_AVAILABLE
        stats["cuda_initialized"] = self.initialized
        # NOTE(review): device name is hard-coded rather than queried.
        stats["gpu_device"] = "NVIDIA GeForce RTX 4060 Ti" if self.cuda_accelerator else "N/A"

        return stats

    async def benchmark_comprehensive_performance(self, max_elements: int = 1000000) -> Dict[str, Any]:
        """Run comprehensive performance benchmark across several sizes."""
        logger.info(f"🚀 Running comprehensive performance benchmark up to {max_elements:,} elements")

        benchmark_results = {
            "field_addition": [],
            "constraint_verification": [],
            "witness_generation": [],
            "summary": {},
        }

        for size in [1000, 10000, 100000, max_elements]:
            logger.info(f"📊 Benchmarking {size:,} elements...")

            # Build one request per operation type; dict order preserves
            # the original execution order.
            requests = {
                "field_addition": ZKOperationRequest(
                    operation_type="field_addition",
                    circuit_data={"num_elements": size},
                    use_gpu=True,
                ),
                "constraint_verification": ZKOperationRequest(
                    operation_type="constraint_verification",
                    circuit_data={"num_constraints": size},
                    use_gpu=True,
                ),
                "witness_generation": ZKOperationRequest(
                    operation_type="witness_generation",
                    circuit_data={"num_inputs": size // 10},  # required circuit_data
                    witness_data={"num_inputs": size // 10, "witness_size": size},
                    use_gpu=True,
                ),
            }

            for op_name, op_request in requests.items():
                outcome = await self.process_zk_operation(op_request)
                benchmark_results[op_name].append({
                    "size": size,
                    "result": asdict(outcome),
                })

        benchmark_results["summary"] = self._calculate_benchmark_summary(benchmark_results)

        logger.info("✅ Comprehensive benchmark completed")
        return benchmark_results

    def _calculate_benchmark_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize avg/max speedup and throughput per operation type."""
        summary = {}

        for op_name in ("field_addition", "constraint_verification", "witness_generation"):
            entries = results[op_name]

            speedups = [e["result"]["speedup"] for e in entries if e["result"]["speedup"]]
            throughputs = [e["result"]["throughput"] for e in entries if e["result"]["throughput"]]

            if speedups:
                summary[f"{op_name}_avg_speedup"] = sum(speedups) / len(speedups)
                summary[f"{op_name}_max_speedup"] = max(speedups)

            if throughputs:
                summary[f"{op_name}_avg_throughput"] = sum(throughputs) / len(throughputs)
                summary[f"{op_name}_max_throughput"] = max(throughputs)

        return summary
|
|
||||||
|
|
||||||
# Global API instance
cuda_zk_api = ProductionCUDAZKAPI()


async def main():
    """Smoke-test the production API: one request per operation type,
    then statistics and a comprehensive benchmark."""
    print("🚀 AITBC Production CUDA ZK API Test")
    print("=" * 50)

    try:
        # (label, request) pairs, run in the original order.
        test_cases = [
            ("Field Addition", ZKOperationRequest(
                operation_type="field_addition",
                circuit_data={"num_elements": 100000},
                use_gpu=True,
            )),
            ("Constraint Verification", ZKOperationRequest(
                operation_type="constraint_verification",
                circuit_data={"num_constraints": 50000},
                use_gpu=True,
            )),
            ("Witness Generation", ZKOperationRequest(
                operation_type="witness_generation",
                circuit_data={"num_inputs": 1000},  # required circuit_data
                witness_data={"num_inputs": 1000, "witness_size": 50000},
                use_gpu=True,
            )),
        ]

        for label, request in test_cases:
            print(f"\n📊 Testing {label}...")
            outcome = await cuda_zk_api.process_zk_operation(request)
            print(f" Result: {outcome.success}")
            print(f" GPU Used: {outcome.gpu_used}")
            print(f" Speedup: {outcome.speedup:.2f}x" if outcome.speedup else " Speedup: N/A")

        print("\n📊 Performance Statistics:")
        for key, value in cuda_zk_api.get_performance_statistics().items():
            print(f" {key}: {value}")

        print("\n🚀 Running Comprehensive Benchmark...")
        benchmark_results = await cuda_zk_api.benchmark_comprehensive_performance(100000)

        print("\n✅ Production API test completed successfully!")

    except Exception as e:
        print(f"❌ Test failed: {e}")


if __name__ == "__main__":
    asyncio.run(main())
|
|
||||||
Reference in New Issue
Block a user