diff --git a/apps/coordinator-api/src/app/routers/marketplace_performance.py b/apps/coordinator-api/src/app/routers/marketplace_performance.py index a7b9aa11..2ccbf11e 100755 --- a/apps/coordinator-api/src/app/routers/marketplace_performance.py +++ b/apps/coordinator-api/src/app/routers/marketplace_performance.py @@ -16,16 +16,16 @@ logger = logging.getLogger(__name__) import os import sys -sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../gpu_acceleration")) -from marketplace_gpu_optimizer import MarketplaceGPUOptimizer +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../dev/gpu_acceleration")) +from parallel_processing.marketplace_gpu_optimizer import MarketplaceGPUOptimizer -from aitbc.gpu_acceleration.parallel_processing.distributed_framework import ( +from dev.gpu_acceleration.parallel_processing.distributed_framework import ( DistributedProcessingCoordinator, DistributedTask, ) -from aitbc.gpu_acceleration.parallel_processing.marketplace_cache_optimizer import MarketplaceDataOptimizer -from aitbc.gpu_acceleration.parallel_processing.marketplace_monitor import monitor as marketplace_monitor -from aitbc.gpu_acceleration.parallel_processing.marketplace_scaler import ResourceScaler +from dev.gpu_acceleration.parallel_processing.marketplace_cache_optimizer import MarketplaceDataOptimizer +from dev.gpu_acceleration.parallel_processing.marketplace_monitor import monitor as marketplace_monitor +from dev.gpu_acceleration.parallel_processing.marketplace_scaler import ResourceScaler router = APIRouter(prefix="/v1/marketplace/performance", tags=["marketplace-performance"]) diff --git a/dev/gpu_acceleration/legacy/fastapi_cuda_zk_api.py b/dev/gpu_acceleration/legacy/fastapi_cuda_zk_api.py deleted file mode 100755 index 28a3c80e..00000000 --- a/dev/gpu_acceleration/legacy/fastapi_cuda_zk_api.py +++ /dev/null @@ -1,354 +0,0 @@ -#!/usr/bin/env python3 -""" -FastAPI Integration for Production CUDA ZK Accelerator -Provides REST API endpoints for GPU-accelerated ZK circuit operations -""" - -from fastapi import FastAPI, HTTPException, BackgroundTasks -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, Field -from typing import Dict, List, Optional, Any -import asyncio -import logging -import time -import os -import sys - -# Add GPU acceleration path -sys.path.append('/home/oib/windsurf/aitbc/gpu_acceleration') - -try: - from production_cuda_zk_api import ProductionCUDAZKAPI, ZKOperationRequest, ZKOperationResult - CUDA_AVAILABLE = True -except ImportError as e: - CUDA_AVAILABLE = False - print(f"āš ļø CUDA API import failed: {e}") - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger("CUDA_ZK_FASTAPI") - -# Initialize FastAPI app -app = FastAPI( - title="AITBC CUDA ZK Acceleration API", - description="Production-ready GPU acceleration for zero-knowledge circuit operations", - version="1.0.0", - docs_url="/docs", - redoc_url="/redoc" -) - -# Add CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Initialize CUDA API -cuda_api = ProductionCUDAZKAPI() - -# Pydantic models for API -class FieldAdditionRequest(BaseModel): - num_elements: int = Field(..., ge=1, le=10000000, description="Number of field elements") - modulus: Optional[List[int]] = Field(default=[0xFFFFFFFFFFFFFFFF] * 4, description="Field modulus") - optimization_level: str = Field(default="high", pattern="^(low|medium|high)$") - use_gpu: bool = Field(default=True, description="Use GPU acceleration") - -class ConstraintVerificationRequest(BaseModel): - num_constraints: int = Field(..., ge=1, le=10000000, description="Number of constraints") - constraints: Optional[List[Dict[str, Any]]] = Field(default=None, description="Constraint data") - optimization_level: str = Field(default="high", pattern="^(low|medium|high)$") - use_gpu: bool = Field(default=True, description="Use GPU acceleration") - -class WitnessGenerationRequest(BaseModel): - num_inputs: int = Field(..., ge=1, le=1000000, description="Number of inputs") - witness_size: int = Field(..., ge=1, le=10000000, description="Witness size") - optimization_level: str = Field(default="high", pattern="^(low|medium|high)$") - use_gpu: bool = Field(default=True, description="Use GPU acceleration") - -class BenchmarkRequest(BaseModel): - max_elements: int = Field(default=1000000, ge=1000, le=10000000, description="Maximum elements to benchmark") - -class APIResponse(BaseModel): - success: bool - message: str - data: Optional[Dict[str, Any]] = None - execution_time: Optional[float] = None - gpu_used: Optional[bool] = None - speedup: Optional[float] = None - -# Health check endpoint -@app.get("/health", response_model=Dict[str, Any]) -async def health_check(): - """Health check endpoint""" - try: - stats = cuda_api.get_performance_statistics() - return { - "status": "healthy", - "timestamp": time.time(), - "cuda_available": stats["cuda_available"], - "cuda_initialized": stats["cuda_initialized"], - "gpu_device": stats["gpu_device"] - } - except Exception as e: - logger.error(f"Health check failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Performance statistics endpoint -@app.get("/stats", response_model=Dict[str, Any]) -async def get_performance_stats(): - """Get comprehensive performance statistics""" - try: - return cuda_api.get_performance_statistics() - except Exception as e: - logger.error(f"Failed to get stats: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Field addition endpoint -@app.post("/field-addition", response_model=APIResponse) -async def field_addition(request: FieldAdditionRequest): - """Perform GPU-accelerated field addition""" - start_time = time.time() - - try: - zk_request = ZKOperationRequest( - operation_type="field_addition", - circuit_data={ - "num_elements": request.num_elements, - "modulus": request.modulus - }, - optimization_level=request.optimization_level, - use_gpu=request.use_gpu - ) - - result = await cuda_api.process_zk_operation(zk_request) - - return APIResponse( - success=result.success, - message="Field addition completed successfully" if result.success else "Field addition failed", - data=result.result_data, - execution_time=result.execution_time, - gpu_used=result.gpu_used, - speedup=result.speedup - ) - - except Exception as e: - logger.error(f"Field addition failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Constraint verification endpoint -@app.post("/constraint-verification", response_model=APIResponse) -async def constraint_verification(request: ConstraintVerificationRequest): - """Perform GPU-accelerated constraint verification""" - start_time = time.time() - - try: - zk_request = ZKOperationRequest( - operation_type="constraint_verification", - circuit_data={"num_constraints": request.num_constraints}, - constraints=request.constraints, - optimization_level=request.optimization_level, - use_gpu=request.use_gpu - ) - - result = await cuda_api.process_zk_operation(zk_request) - - return APIResponse( - success=result.success, - message="Constraint verification completed successfully" if result.success else "Constraint verification failed", - data=result.result_data, - execution_time=result.execution_time, - gpu_used=result.gpu_used, - speedup=result.speedup - ) - - except Exception as e: - logger.error(f"Constraint verification failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Witness generation endpoint -@app.post("/witness-generation", response_model=APIResponse) -async def witness_generation(request: WitnessGenerationRequest): - """Perform GPU-accelerated witness generation""" - start_time = time.time() - - try: - zk_request = ZKOperationRequest( - operation_type="witness_generation", - circuit_data={"num_inputs": request.num_inputs}, - witness_data={"num_inputs": request.num_inputs, "witness_size": request.witness_size}, - optimization_level=request.optimization_level, - use_gpu=request.use_gpu - ) - - result = await cuda_api.process_zk_operation(zk_request) - - return APIResponse( - success=result.success, - message="Witness generation completed successfully" if result.success else "Witness generation failed", - data=result.result_data, - execution_time=result.execution_time, - gpu_used=result.gpu_used, - speedup=result.speedup - ) - - except Exception as e: - logger.error(f"Witness generation failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Comprehensive benchmark endpoint -@app.post("/benchmark", response_model=Dict[str, Any]) -async def comprehensive_benchmark(request: BenchmarkRequest, background_tasks: BackgroundTasks): - """Run comprehensive performance benchmark""" - try: - logger.info(f"Starting comprehensive benchmark up to {request.max_elements:,} elements") - - # Run benchmark asynchronously - results = await cuda_api.benchmark_comprehensive_performance(request.max_elements) - - return { - "success": True, - "message": "Comprehensive benchmark completed", - "data": results, - "timestamp": time.time() - } - - except Exception as e: - logger.error(f"Benchmark failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Quick benchmark endpoint -@app.get("/quick-benchmark", response_model=Dict[str, Any]) -async def quick_benchmark(): - """Run quick performance benchmark""" - try: - logger.info("Running quick benchmark") - - # Test field addition with 100K elements - field_request = ZKOperationRequest( - operation_type="field_addition", - circuit_data={"num_elements": 100000}, - use_gpu=True - ) - field_result = await cuda_api.process_zk_operation(field_request) - - # Test constraint verification with 50K constraints - constraint_request = ZKOperationRequest( - operation_type="constraint_verification", - circuit_data={"num_constraints": 50000}, - use_gpu=True - ) - constraint_result = await cuda_api.process_zk_operation(constraint_request) - - return { - "success": True, - "message": "Quick benchmark completed", - "data": { - "field_addition": { - "success": field_result.success, - "execution_time": field_result.execution_time, - "gpu_used": field_result.gpu_used, - "speedup": field_result.speedup, - "throughput": field_result.throughput - }, - "constraint_verification": { - "success": constraint_result.success, - "execution_time": constraint_result.execution_time, - "gpu_used": constraint_result.gpu_used, - "speedup": constraint_result.speedup, - "throughput": constraint_result.throughput - } - }, - "timestamp": time.time() - } - - except Exception as e: - logger.error(f"Quick benchmark failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# GPU information endpoint -@app.get("/gpu-info", response_model=Dict[str, Any]) -async def get_gpu_info(): - """Get GPU information and capabilities""" - try: - stats = cuda_api.get_performance_statistics() - - return { - "cuda_available": stats["cuda_available"], - "cuda_initialized": stats["cuda_initialized"], - "gpu_device": stats["gpu_device"], - "total_operations": stats["total_operations"], - "gpu_operations": stats["gpu_operations"], - "cpu_operations": stats["cpu_operations"], - "gpu_usage_rate": stats.get("gpu_usage_rate", 0), - "average_speedup": stats.get("average_speedup", 0), - "average_execution_time": stats.get("average_execution_time", 0) - } - - except Exception as e: - logger.error(f"Failed to get GPU info: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Reset statistics endpoint -@app.post("/reset-stats", response_model=Dict[str, str]) -async def reset_statistics(): - """Reset performance statistics""" - try: - # Reset the statistics in the CUDA API - cuda_api.operation_stats = { - "total_operations": 0, - "gpu_operations": 0, - "cpu_operations": 0, - "total_time": 0.0, - "average_speedup": 0.0 - } - - return {"success": True, "message": "Statistics reset successfully"} - - except Exception as e: - logger.error(f"Failed to reset stats: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -# Root endpoint -@app.get("/", response_model=Dict[str, Any]) -async def root(): - """Root endpoint with API information""" - return { - "name": "AITBC CUDA ZK Acceleration API", - "version": "1.0.0", - "description": "Production-ready GPU acceleration for zero-knowledge circuit operations", - "endpoints": { - "health": "/health", - "stats": "/stats", - "gpu_info": "/gpu-info", - "field_addition": "/field-addition", - "constraint_verification": "/constraint-verification", - "witness_generation": "/witness-generation", - "quick_benchmark": "/quick-benchmark", - "comprehensive_benchmark": "/benchmark", - "docs": "/docs", - "redoc": "/redoc" - }, - "cuda_available": CUDA_AVAILABLE, - "timestamp": time.time() - } - -if __name__ == "__main__": - import uvicorn - - print("šŸš€ Starting AITBC CUDA ZK Acceleration API Server") - print("=" * 50) - print(f" CUDA Available: {CUDA_AVAILABLE}") - print(f" API Documentation: http://localhost:8001/docs") - print(f" ReDoc Documentation: http://localhost:8001/redoc") - print("=" * 50) - - uvicorn.run( - "fastapi_cuda_zk_api:app", - host="0.0.0.0", - port=8001, - reload=True, - log_level="info" - ) diff --git a/dev/gpu_acceleration/legacy/high_performance_cuda_accelerator.py b/dev/gpu_acceleration/legacy/high_performance_cuda_accelerator.py deleted file mode 100755 index 4f4ff6f6..00000000 --- a/dev/gpu_acceleration/legacy/high_performance_cuda_accelerator.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/env python3 -""" -High-Performance CUDA ZK Accelerator with Optimized Kernels -Implements optimized CUDA kernels with memory coalescing, vectorization, and shared memory -""" - -import ctypes -import numpy as np -from typing import List, Tuple, Optional -import os -import sys -import time - -# Optimized field element structure for flat array access -class OptimizedFieldElement(ctypes.Structure): - _fields_ = [("limbs", ctypes.c_uint64 * 4)] - -class HighPerformanceCUDAZKAccelerator: - """High-performance Python interface for optimized CUDA ZK operations""" - - def __init__(self, lib_path: str = None): - """ - Initialize high-performance CUDA accelerator - - Args: - lib_path: Path to compiled optimized CUDA library (.so file) - """ - self.lib_path = lib_path or self._find_optimized_cuda_lib() - self.lib = None - self.initialized = False - - try: - self.lib = ctypes.CDLL(self.lib_path) - self._setup_function_signatures() - self.initialized = True - print(f"āœ… High-Performance CUDA ZK Accelerator initialized: {self.lib_path}") - except Exception as e: - print(f"āŒ Failed to initialize CUDA accelerator: {e}") - self.initialized = False - - def _find_optimized_cuda_lib(self) -> str: - """Find the compiled optimized CUDA library""" - possible_paths = [ - "./liboptimized_field_operations.so", - "./optimized_field_operations.so", - "../liboptimized_field_operations.so", - "../../liboptimized_field_operations.so", - "/usr/local/lib/liboptimized_field_operations.so" - ] - - for path in possible_paths: - if os.path.exists(path): - return path - - raise FileNotFoundError("Optimized CUDA library not found. Please compile optimized_field_operations.cu first.") - - def _setup_function_signatures(self): - """Setup function signatures for optimized CUDA library functions""" - if not self.lib: - return - - # Initialize optimized CUDA device - self.lib.init_optimized_cuda_device.argtypes = [] - self.lib.init_optimized_cuda_device.restype = ctypes.c_int - - # Optimized field addition with flat arrays - self.lib.gpu_optimized_field_addition.argtypes = [ - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - ctypes.c_int - ] - self.lib.gpu_optimized_field_addition.restype = ctypes.c_int - - # Vectorized field addition - self.lib.gpu_vectorized_field_addition.argtypes = [ - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), # field_vector_t - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - ctypes.c_int - ] - self.lib.gpu_vectorized_field_addition.restype = ctypes.c_int - - # Shared memory field addition - self.lib.gpu_shared_memory_field_addition.argtypes = [ - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - np.ctypeslib.ndpointer(ctypes.c_uint64, flags="C_CONTIGUOUS"), - ctypes.c_int - ] - self.lib.gpu_shared_memory_field_addition.restype = ctypes.c_int - - def init_device(self) -> bool: - """Initialize optimized CUDA device and check capabilities""" - if not self.initialized: - print("āŒ CUDA accelerator not initialized") - return False - - try: - result = self.lib.init_optimized_cuda_device() - if result == 0: - print("āœ… Optimized CUDA device initialized successfully") - return True - else: - print(f"āŒ CUDA device initialization failed: {result}") - return False - except Exception as e: - print(f"āŒ CUDA device initialization error: {e}") - return False - - def benchmark_optimized_kernels(self, max_elements: int = 10000000) -> dict: - """ - Benchmark all optimized CUDA kernels and compare performance - - Args: - max_elements: Maximum number of elements to test - - Returns: - Comprehensive performance benchmark results - """ - if not self.initialized: - return {"error": "CUDA accelerator not initialized"} - - print(f"šŸš€ High-Performance CUDA Kernel Benchmark (up to {max_elements:,} elements)") - print("=" * 80) - - # Test different dataset sizes - test_sizes = [ - 1000, # 1K elements - 10000, # 10K elements - 100000, # 100K elements - 1000000, # 1M elements - 5000000, # 5M elements - 10000000, # 10M elements - ] - - results = { - "test_sizes": [], - "optimized_flat": [], - "vectorized": [], - "shared_memory": [], - "cpu_baseline": [], - "performance_summary": {} - } - - for size in test_sizes: - if size > max_elements: - break - - print(f"\nšŸ“Š Benchmarking {size:,} elements...") - - # Generate test data as flat arrays for optimal memory access - a_flat, b_flat = self._generate_flat_test_data(size) - - # bn128 field modulus (simplified) - modulus = [0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF] - - # Benchmark optimized flat array kernel - flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, size) - - # Benchmark vectorized kernel - vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, size) - - # Benchmark shared memory kernel - shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, size) - - # Benchmark CPU baseline - cpu_result = self._benchmark_cpu_baseline(a_flat, b_flat, modulus, size) - - # Store results - results["test_sizes"].append(size) - results["optimized_flat"].append(flat_result) - results["vectorized"].append(vec_result) - results["shared_memory"].append(shared_result) - results["cpu_baseline"].append(cpu_result) - - # Print comparison - print(f" Optimized Flat: {flat_result['time']:.4f}s, {flat_result['throughput']:.0f} elem/s") - print(f" Vectorized: {vec_result['time']:.4f}s, {vec_result['throughput']:.0f} elem/s") - print(f" Shared Memory: {shared_result['time']:.4f}s, {shared_result['throughput']:.0f} elem/s") - print(f" CPU Baseline: {cpu_result['time']:.4f}s, {cpu_result['throughput']:.0f} elem/s") - - # Calculate speedups - flat_speedup = cpu_result['time'] / flat_result['time'] if flat_result['time'] > 0 else 0 - vec_speedup = cpu_result['time'] / vec_result['time'] if vec_result['time'] > 0 else 0 - shared_speedup = cpu_result['time'] / shared_result['time'] if shared_result['time'] > 0 else 0 - - print(f" Speedups - Flat: {flat_speedup:.2f}x, Vec: {vec_speedup:.2f}x, Shared: {shared_speedup:.2f}x") - - # Calculate performance summary - results["performance_summary"] = self._calculate_performance_summary(results) - - # Print final summary - self._print_performance_summary(results["performance_summary"]) - - return results - - def _benchmark_optimized_flat_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray, - modulus: List[int], num_elements: int) -> dict: - """Benchmark optimized flat array kernel""" - try: - result_flat = np.zeros_like(a_flat) - modulus_array = np.array(modulus, dtype=np.uint64) - - # Multiple runs for consistency - times = [] - for run in range(3): - start_time = time.time() - success = self.lib.gpu_optimized_field_addition( - a_flat, b_flat, result_flat, modulus_array, num_elements - ) - run_time = time.time() - start_time - - if success == 0: # Success - times.append(run_time) - - if not times: - return {"time": float('inf'), "throughput": 0, "success": False} - - avg_time = sum(times) / len(times) - throughput = num_elements / avg_time if avg_time > 0 else 0 - - return {"time": avg_time, "throughput": throughput, "success": True} - - except Exception as e: - print(f" āŒ Optimized flat kernel error: {e}") - return {"time": float('inf'), "throughput": 0, "success": False} - - def _benchmark_vectorized_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray, - modulus: List[int], num_elements: int) -> dict: - """Benchmark vectorized kernel""" - try: - # Convert flat arrays to vectorized format (uint4) - # For simplicity, we'll reuse the flat array kernel as vectorized - # In practice, would convert to proper vector format - result_flat = np.zeros_like(a_flat) - modulus_array = np.array(modulus, dtype=np.uint64) - - times = [] - for run in range(3): - start_time = time.time() - success = self.lib.gpu_vectorized_field_addition( - a_flat, b_flat, result_flat, modulus_array, num_elements - ) - run_time = time.time() - start_time - - if success == 0: - times.append(run_time) - - if not times: - return {"time": float('inf'), "throughput": 0, "success": False} - - avg_time = sum(times) / len(times) - throughput = num_elements / avg_time if avg_time > 0 else 0 - - return {"time": avg_time, "throughput": throughput, "success": True} - - except Exception as e: - print(f" āŒ Vectorized kernel error: {e}") - return {"time": float('inf'), "throughput": 0, "success": False} - - def _benchmark_shared_memory_kernel(self, a_flat: np.ndarray, b_flat: np.ndarray, - modulus: List[int], num_elements: int) -> dict: - """Benchmark shared memory kernel""" - try: - result_flat = np.zeros_like(a_flat) - modulus_array = np.array(modulus, dtype=np.uint64) - - times = [] - for run in range(3): - start_time = time.time() - success = self.lib.gpu_shared_memory_field_addition( - a_flat, b_flat, result_flat, modulus_array, num_elements - ) - run_time = time.time() - start_time - - if success == 0: - times.append(run_time) - - if not times: - return {"time": float('inf'), "throughput": 0, "success": False} - - avg_time = sum(times) / len(times) - throughput = num_elements / avg_time if avg_time > 0 else 0 - - return {"time": avg_time, "throughput": throughput, "success": True} - - except Exception as e: - print(f" āŒ Shared memory kernel error: {e}") - return {"time": float('inf'), "throughput": 0, "success": False} - - def _benchmark_cpu_baseline(self, a_flat: np.ndarray, b_flat: np.ndarray, - modulus: List[int], num_elements: int) -> dict: - """Benchmark CPU baseline for comparison""" - try: - start_time = time.time() - - # Simple CPU field addition - result_flat = np.zeros_like(a_flat) - for i in range(num_elements): - base_idx = i * 4 - for j in range(4): - result_flat[base_idx + j] = (a_flat[base_idx + j] + b_flat[base_idx + j]) % modulus[j] - - cpu_time = time.time() - start_time - throughput = num_elements / cpu_time if cpu_time > 0 else 0 - - return {"time": cpu_time, "throughput": throughput, "success": True} - - except Exception as e: - print(f" āŒ CPU baseline error: {e}") - return {"time": float('inf'), "throughput": 0, "success": False} - - def _generate_flat_test_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]: - """Generate flat array test data for optimal memory access""" - # Generate flat arrays (num_elements * 4 limbs) - flat_size = num_elements * 4 - - # Use numpy for fast generation - a_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64) - b_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64) - - return a_flat, b_flat - - def _calculate_performance_summary(self, results: dict) -> dict: - """Calculate performance summary statistics""" - summary = {} - - # Find best performing kernel for each size - best_speedups = [] - best_throughputs = [] - - for i, size in enumerate(results["test_sizes"]): - cpu_time = results["cpu_baseline"][i]["time"] - - # Calculate speedups - flat_speedup = cpu_time / results["optimized_flat"][i]["time"] if results["optimized_flat"][i]["time"] > 0 else 0 - vec_speedup = cpu_time / results["vectorized"][i]["time"] if results["vectorized"][i]["time"] > 0 else 0 - shared_speedup = cpu_time / results["shared_memory"][i]["time"] if results["shared_memory"][i]["time"] > 0 else 0 - - best_speedup = max(flat_speedup, vec_speedup, shared_speedup) - best_speedups.append(best_speedup) - - # Find best throughput - best_throughput = max( - results["optimized_flat"][i]["throughput"], - results["vectorized"][i]["throughput"], - results["shared_memory"][i]["throughput"] - ) - best_throughputs.append(best_throughput) - - if best_speedups: - summary["best_speedup"] = max(best_speedups) - summary["average_speedup"] = sum(best_speedups) / len(best_speedups) - summary["best_speedup_size"] = results["test_sizes"][best_speedups.index(max(best_speedups))] - - if best_throughputs: - summary["best_throughput"] = max(best_throughputs) - summary["average_throughput"] = sum(best_throughputs) / len(best_throughputs) - summary["best_throughput_size"] = results["test_sizes"][best_throughputs.index(max(best_throughputs))] - - return summary - - def _print_performance_summary(self, summary: dict): - """Print comprehensive performance summary""" - print(f"\nšŸŽÆ High-Performance CUDA Summary:") - print("=" * 50) - - if "best_speedup" in summary: - print(f" Best Speedup: {summary['best_speedup']:.2f}x at {summary.get('best_speedup_size', 'N/A'):,} elements") - print(f" Average Speedup: {summary['average_speedup']:.2f}x across all tests") - - if "best_throughput" in summary: - print(f" Best Throughput: {summary['best_throughput']:.0f} elements/s at {summary.get('best_throughput_size', 'N/A'):,} elements") - print(f" Average Throughput: {summary['average_throughput']:.0f} elements/s") - - # Performance classification - if summary.get("best_speedup", 0) > 5: - print(" šŸš€ Performance: EXCELLENT - Significant GPU acceleration achieved") - elif summary.get("best_speedup", 0) > 2: - print(" āœ… Performance: GOOD - Measurable GPU acceleration achieved") - elif summary.get("best_speedup", 0) > 1: - print(" āš ļø Performance: MODERATE - Limited GPU acceleration") - else: - print(" āŒ Performance: POOR - No significant GPU acceleration") - - def analyze_memory_bandwidth(self, num_elements: int = 1000000) -> dict: - """Analyze memory bandwidth performance""" - print(f"šŸ” Analyzing Memory Bandwidth Performance ({num_elements:,} elements)...") - - a_flat, b_flat = self._generate_flat_test_data(num_elements) - modulus = [0xFFFFFFFFFFFFFFFF] * 4 - - # Test different kernels - flat_result = self._benchmark_optimized_flat_kernel(a_flat, b_flat, modulus, num_elements) - vec_result = self._benchmark_vectorized_kernel(a_flat, b_flat, modulus, num_elements) - shared_result = self._benchmark_shared_memory_kernel(a_flat, b_flat, modulus, num_elements) - - # Calculate theoretical bandwidth - data_size = num_elements * 4 * 8 * 3 # 3 arrays, 4 limbs, 8 bytes - - analysis = { - "data_size_gb": data_size / (1024**3), - "flat_bandwidth_gb_s": data_size / (flat_result['time'] * 1024**3) if flat_result['time'] > 0 else 0, - "vectorized_bandwidth_gb_s": data_size / (vec_result['time'] * 1024**3) if vec_result['time'] > 0 else 0, - "shared_bandwidth_gb_s": data_size / (shared_result['time'] * 1024**3) if shared_result['time'] > 0 else 0, - } - - print(f" Data Size: {analysis['data_size_gb']:.2f} GB") - print(f" Flat Kernel: {analysis['flat_bandwidth_gb_s']:.2f} GB/s") - print(f" Vectorized Kernel: {analysis['vectorized_bandwidth_gb_s']:.2f} GB/s") - print(f" Shared Memory Kernel: {analysis['shared_bandwidth_gb_s']:.2f} GB/s") - - return analysis - -def main(): - """Main function for testing high-performance CUDA acceleration""" - print("šŸš€ AITBC High-Performance CUDA ZK Accelerator Test") - print("=" * 60) - - try: - # Initialize high-performance accelerator - accelerator = HighPerformanceCUDAZKAccelerator() - - if not accelerator.initialized: - print("āŒ Failed to initialize CUDA accelerator") - return - - # Initialize device - if not accelerator.init_device(): - return - - # Run comprehensive benchmark - results = accelerator.benchmark_optimized_kernels(10000000) - - # Analyze memory bandwidth - bandwidth_analysis = accelerator.analyze_memory_bandwidth(1000000) - - print("\nāœ… High-Performance CUDA acceleration test completed!") - - if results.get("performance_summary", {}).get("best_speedup", 0) > 1: - print(f"šŸš€ Optimization successful: {results['performance_summary']['best_speedup']:.2f}x speedup achieved") - else: - print("āš ļø Further optimization needed") - - except Exception as e: - print(f"āŒ Test failed: {e}") - -if __name__ == "__main__": - main() diff --git a/dev/gpu_acceleration/legacy/production_cuda_zk_api.py b/dev/gpu_acceleration/legacy/production_cuda_zk_api.py deleted file mode 100755 index f808d9b4..00000000 --- a/dev/gpu_acceleration/legacy/production_cuda_zk_api.py +++ /dev/null @@ -1,609 +0,0 @@ -#!/usr/bin/env python3 -""" -Production-Ready CUDA ZK Accelerator API -Integrates optimized CUDA kernels with AITBC ZK workflow and Coordinator API -""" - -import os -import sys -import json -import time -import logging -import asyncio -from typing import Dict, List, Optional, Tuple, Any -from dataclasses import dataclass, asdict -from pathlib import Path -import numpy as np - -# Configure CUDA library paths before importing CUDA modules -import os -os.environ['LD_LIBRARY_PATH'] = '/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64' - -# Add CUDA accelerator path -sys.path.append('/home/oib/windsurf/aitbc/gpu_acceleration') - -try: - from high_performance_cuda_accelerator import HighPerformanceCUDAZKAccelerator - CUDA_AVAILABLE = True -except ImportError as e: - CUDA_AVAILABLE = False - print(f"āš ļø CUDA accelerator import failed: {e}") - print(" Falling back to CPU operations") - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger("CUDA_ZK_API") - -@dataclass -class ZKOperationRequest: - """Request structure for ZK operations""" - operation_type: str # 'field_addition', 'constraint_verification', 'witness_generation' - circuit_data: Dict[str, Any] - witness_data: Optional[Dict[str, Any]] = None - constraints: Optional[List[Dict[str, Any]]] = None - optimization_level: str = "high" # 'low', 'medium', 'high' - use_gpu: bool = True - timeout_seconds: int = 300 - -@dataclass -class ZKOperationResult: - """Result structure for ZK operations""" - success: bool - operation_type: str - execution_time: float - gpu_used: bool - speedup: Optional[float] = None - throughput: Optional[float] = None - result_data: Optional[Dict[str, Any]] = None - error_message: Optional[str] = None - performance_metrics: Optional[Dict[str, Any]] = None - -class ProductionCUDAZKAPI: - """Production-ready CUDA ZK Accelerator API""" - - def __init__(self): - """Initialize the production CUDA ZK API""" - self.cuda_accelerator = None - self.initialized = False - self.performance_cache = {} - self.operation_stats = { - "total_operations": 0, - "gpu_operations": 0, - "cpu_operations": 0, - "total_time": 0.0, - "average_speedup": 0.0 - } - - # Initialize CUDA accelerator - self._initialize_cuda_accelerator() - - logger.info("šŸš€ Production CUDA ZK API initialized") - logger.info(f" CUDA Available: {CUDA_AVAILABLE}") - logger.info(f" GPU Accelerator: {'Ready' if self.cuda_accelerator else 'Not Available'}") - - def _initialize_cuda_accelerator(self): - """Initialize CUDA accelerator if available""" - if not CUDA_AVAILABLE: - logger.warning("CUDA not available, using CPU-only operations") - return - - try: - self.cuda_accelerator = HighPerformanceCUDAZKAccelerator() - if self.cuda_accelerator.init_device(): - self.initialized = True - logger.info("āœ… CUDA accelerator initialized successfully") - else: - logger.error("āŒ Failed to initialize CUDA device") - self.cuda_accelerator = None - except Exception as e: - logger.error(f"āŒ CUDA accelerator initialization failed: {e}") - self.cuda_accelerator = None - - async def process_zk_operation(self, request: ZKOperationRequest) -> ZKOperationResult: - """ - Process a ZK operation with GPU acceleration - - Args: - request: ZK operation request - - Returns: - ZK operation result - """ - start_time = time.time() - operation_type = request.operation_type - - logger.info(f"šŸ”„ Processing {operation_type} operation") - logger.info(f" GPU Requested: {request.use_gpu}") - logger.info(f" Optimization Level: {request.optimization_level}") - - try: - # Update statistics - self.operation_stats["total_operations"] += 1 - - # Process operation based on type - if operation_type == "field_addition": - result = await self._process_field_addition(request) - elif operation_type == "constraint_verification": - result = await self._process_constraint_verification(request) - elif operation_type == "witness_generation": - result = await self._process_witness_generation(request) - else: - result = ZKOperationResult( - success=False, - operation_type=operation_type, - execution_time=time.time() - start_time, - gpu_used=False, - error_message=f"Unsupported operation type: {operation_type}" - ) - - # Update statistics - execution_time = time.time() - start_time - self.operation_stats["total_time"] += execution_time - - if result.gpu_used: - self.operation_stats["gpu_operations"] += 1 - if result.speedup: - self._update_average_speedup(result.speedup) - else: - self.operation_stats["cpu_operations"] += 1 - - logger.info(f"āœ… Operation completed in {execution_time:.4f}s") - if result.speedup: - logger.info(f" Speedup: {result.speedup:.2f}x") - - return result - - except Exception as e: - logger.error(f"āŒ Operation failed: {e}") - return ZKOperationResult( - success=False, - operation_type=operation_type, - execution_time=time.time() - start_time, - gpu_used=False, - error_message=str(e) - ) - - async def _process_field_addition(self, request: ZKOperationRequest) -> ZKOperationResult: - """Process field addition operation""" - start_time = time.time() - - # Extract field data from request - circuit_data = request.circuit_data - num_elements = circuit_data.get("num_elements", 1000) - - # Generate test data (in production, would use actual circuit data) - a_flat, b_flat = self._generate_field_data(num_elements) - modulus = circuit_data.get("modulus", [0xFFFFFFFFFFFFFFFF] * 4) - - gpu_used = False - speedup = None - throughput = None - performance_metrics = None - - if request.use_gpu and self.cuda_accelerator and self.initialized: - # Use GPU acceleration - try: - gpu_result = self.cuda_accelerator._benchmark_optimized_flat_kernel( - a_flat, b_flat, modulus, num_elements - ) - - if gpu_result["success"]: - gpu_used = True - gpu_time = gpu_result["time"] - throughput = gpu_result["throughput"] - - # Compare with CPU baseline - cpu_time = self._cpu_field_addition_time(num_elements) - speedup = cpu_time / gpu_time if gpu_time > 0 else 0 - - performance_metrics = { - "gpu_time": gpu_time, - "cpu_time": cpu_time, - "memory_bandwidth": self._estimate_memory_bandwidth(num_elements, gpu_time), - "gpu_utilization": self._estimate_gpu_utilization(num_elements) - } - - logger.info(f"šŸš€ GPU field addition completed") - logger.info(f" GPU Time: {gpu_time:.4f}s") - logger.info(f" CPU Time: {cpu_time:.4f}s") - logger.info(f" Speedup: {speedup:.2f}x") - - else: - logger.warning("GPU operation failed, falling back to CPU") - - except Exception as e: - logger.warning(f"GPU operation failed: {e}, falling back to CPU") - - # CPU fallback - if not gpu_used: - cpu_time = self._cpu_field_addition_time(num_elements) - throughput = num_elements / cpu_time if cpu_time > 0 else 0 - performance_metrics = { - "cpu_time": cpu_time, - "cpu_throughput": throughput - } - - execution_time = time.time() - start_time - - return ZKOperationResult( - success=True, - operation_type="field_addition", - execution_time=execution_time, - gpu_used=gpu_used, - speedup=speedup, - throughput=throughput, - result_data={"num_elements": num_elements}, - performance_metrics=performance_metrics - ) - - async def _process_constraint_verification(self, request: ZKOperationRequest) -> ZKOperationResult: - """Process constraint verification operation""" - start_time = time.time() - - # Extract constraint data - constraints = request.constraints or [] - num_constraints = len(constraints) - - if num_constraints == 0: - # Generate test constraints - num_constraints = request.circuit_data.get("num_constraints", 1000) - constraints = self._generate_test_constraints(num_constraints) - - gpu_used = False - speedup = None - throughput = None - performance_metrics = None - - if request.use_gpu and self.cuda_accelerator and self.initialized: - try: - # Use GPU for constraint verification - gpu_time = self._gpu_constraint_verification_time(num_constraints) - gpu_used = True - throughput = num_constraints / gpu_time if gpu_time > 0 else 0 - - # Compare with CPU - cpu_time = self._cpu_constraint_verification_time(num_constraints) - speedup = cpu_time / gpu_time if gpu_time > 0 else 0 - - performance_metrics = { - "gpu_time": gpu_time, - "cpu_time": cpu_time, - "constraints_verified": num_constraints, - "verification_rate": throughput - } - - logger.info(f"šŸš€ GPU constraint verification completed") - logger.info(f" Constraints: {num_constraints}") - logger.info(f" Speedup: {speedup:.2f}x") - - except Exception as e: - logger.warning(f"GPU constraint verification failed: {e}, falling back to CPU") - - # CPU fallback - if not gpu_used: - cpu_time = self._cpu_constraint_verification_time(num_constraints) - throughput = num_constraints / cpu_time if cpu_time > 0 else 0 - performance_metrics = { - "cpu_time": cpu_time, - "constraints_verified": num_constraints, - "verification_rate": throughput - } - - execution_time = time.time() - start_time - - return ZKOperationResult( - success=True, - operation_type="constraint_verification", - execution_time=execution_time, - gpu_used=gpu_used, - speedup=speedup, - throughput=throughput, - result_data={"num_constraints": num_constraints}, - performance_metrics=performance_metrics - ) - - async def _process_witness_generation(self, request: ZKOperationRequest) -> ZKOperationResult: - """Process witness generation operation""" - start_time = time.time() - - # Extract witness data - witness_data = request.witness_data or {} - num_inputs = witness_data.get("num_inputs", 1000) - witness_size = witness_data.get("witness_size", 10000) - - gpu_used = False - speedup = None - throughput = None - performance_metrics = None - - if request.use_gpu and self.cuda_accelerator and self.initialized: - try: - # Use GPU for witness generation - gpu_time = self._gpu_witness_generation_time(num_inputs, witness_size) - gpu_used = True - throughput = witness_size / gpu_time if gpu_time > 0 else 0 - - # Compare with CPU - cpu_time = self._cpu_witness_generation_time(num_inputs, witness_size) - speedup = cpu_time / gpu_time if gpu_time > 0 else 0 - - performance_metrics = { - "gpu_time": gpu_time, - "cpu_time": cpu_time, - "witness_size": witness_size, - "generation_rate": throughput - } - - logger.info(f"šŸš€ GPU witness generation completed") - logger.info(f" Witness Size: {witness_size}") - logger.info(f" Speedup: {speedup:.2f}x") - - except Exception as e: - logger.warning(f"GPU witness generation failed: {e}, falling back to CPU") - - # CPU fallback - if not gpu_used: - cpu_time = self._cpu_witness_generation_time(num_inputs, witness_size) - throughput = witness_size / cpu_time if cpu_time > 0 else 0 - performance_metrics = { - "cpu_time": cpu_time, - "witness_size": witness_size, - "generation_rate": throughput - } - - execution_time = time.time() - start_time - - return ZKOperationResult( - success=True, - operation_type="witness_generation", - execution_time=execution_time, - gpu_used=gpu_used, - speedup=speedup, - throughput=throughput, - result_data={"witness_size": witness_size}, - performance_metrics=performance_metrics - ) - - def _generate_field_data(self, num_elements: int) -> Tuple[np.ndarray, np.ndarray]: - """Generate field test data""" - flat_size = num_elements * 4 - a_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64) - b_flat = np.random.randint(0, 2**32, size=flat_size, dtype=np.uint64) - return a_flat, b_flat - - def _generate_test_constraints(self, num_constraints: int) -> List[Dict[str, Any]]: - """Generate test constraints""" - constraints = [] - for i in range(num_constraints): - constraint = { - "a": [np.random.randint(0, 2**32) for _ in range(4)], - "b": [np.random.randint(0, 2**32) for _ in range(4)], - "c": [np.random.randint(0, 2**32) for _ in range(4)], - "operation": np.random.choice([0, 1]) - } - constraints.append(constraint) - return constraints - - def _cpu_field_addition_time(self, num_elements: int) -> float: - """Estimate CPU field addition time""" - # Based on benchmark: ~725K elements/s for CPU - return num_elements / 725000 - - def _gpu_field_addition_time(self, num_elements: int) -> float: - """Estimate GPU field addition time""" - # Based on benchmark: ~120M elements/s for GPU - return num_elements / 120000000 - - def _cpu_constraint_verification_time(self, num_constraints: int) -> float: - """Estimate CPU constraint verification time""" - # Based on benchmark: ~500K constraints/s for CPU - return num_constraints / 500000 - - def _gpu_constraint_verification_time(self, num_constraints: int) -> float: - """Estimate GPU constraint verification time""" - # Based on benchmark: ~100M constraints/s for GPU - return num_constraints / 100000000 - - def _cpu_witness_generation_time(self, num_inputs: int, witness_size: int) -> float: - """Estimate CPU witness generation time""" - # Based on benchmark: ~1M witness elements/s for CPU - return witness_size / 1000000 - - def _gpu_witness_generation_time(self, num_inputs: int, witness_size: int) -> float: - """Estimate GPU witness generation time""" - # Based on benchmark: ~50M witness elements/s for GPU - return witness_size / 50000000 - - def _estimate_memory_bandwidth(self, num_elements: int, gpu_time: float) -> float: - """Estimate memory bandwidth in GB/s""" - # 3 arrays * 4 limbs * 8 bytes * num_elements - data_size_gb = (3 * 4 * 8 * num_elements) / (1024**3) - return data_size_gb / gpu_time if gpu_time > 0 else 0 - - def _estimate_gpu_utilization(self, num_elements: int) -> float: - """Estimate GPU utilization percentage""" - # Based on thread count and GPU capacity - if num_elements < 1000: - return 20.0 # Low utilization for small workloads - elif num_elements < 10000: - return 60.0 # Medium utilization - elif num_elements < 100000: - return 85.0 # High utilization - else: - return 95.0 # Very high utilization for large workloads - - def _update_average_speedup(self, new_speedup: float): - """Update running average speedup""" - total_ops = self.operation_stats["gpu_operations"] - if total_ops == 1: - self.operation_stats["average_speedup"] = new_speedup - else: - current_avg = self.operation_stats["average_speedup"] - self.operation_stats["average_speedup"] = ( - (current_avg * (total_ops - 1) + new_speedup) / total_ops - ) - - def get_performance_statistics(self) -> Dict[str, Any]: - """Get comprehensive performance statistics""" - stats = self.operation_stats.copy() - - if stats["total_operations"] > 0: - stats["average_execution_time"] = stats["total_time"] / stats["total_operations"] - stats["gpu_usage_rate"] = stats["gpu_operations"] / stats["total_operations"] * 100 - stats["cpu_usage_rate"] = stats["cpu_operations"] / stats["total_operations"] * 100 - else: - stats["average_execution_time"] = 0 - stats["gpu_usage_rate"] = 0 - stats["cpu_usage_rate"] = 0 - - stats["cuda_available"] = CUDA_AVAILABLE - stats["cuda_initialized"] = self.initialized - stats["gpu_device"] = "NVIDIA GeForce RTX 4060 Ti" if self.cuda_accelerator else "N/A" - - return stats - - async def benchmark_comprehensive_performance(self, max_elements: int = 1000000) -> Dict[str, Any]: - """Run comprehensive performance benchmark""" - logger.info(f"šŸš€ Running comprehensive performance benchmark up to {max_elements:,} elements") - - benchmark_results = { - "field_addition": [], - "constraint_verification": [], - "witness_generation": [], - "summary": {} - } - - test_sizes = [1000, 10000, 100000, max_elements] - - for size in test_sizes: - logger.info(f"šŸ“Š Benchmarking {size:,} elements...") - - # Field addition benchmark - field_request = ZKOperationRequest( - operation_type="field_addition", - circuit_data={"num_elements": size}, - use_gpu=True - ) - field_result = await self.process_zk_operation(field_request) - benchmark_results["field_addition"].append({ - "size": size, - "result": asdict(field_result) - }) - - # Constraint verification benchmark - constraint_request = ZKOperationRequest( - operation_type="constraint_verification", - circuit_data={"num_constraints": size}, - use_gpu=True - ) - constraint_result = await self.process_zk_operation(constraint_request) - benchmark_results["constraint_verification"].append({ - "size": size, - "result": asdict(constraint_result) - }) - - # Witness generation benchmark - witness_request = ZKOperationRequest( - operation_type="witness_generation", - circuit_data={"num_inputs": size // 10}, # Add required circuit_data - witness_data={"num_inputs": size // 10, "witness_size": size}, - use_gpu=True - ) - witness_result = await self.process_zk_operation(witness_request) - benchmark_results["witness_generation"].append({ - "size": size, - "result": asdict(witness_result) - }) - - # Calculate summary statistics - benchmark_results["summary"] = self._calculate_benchmark_summary(benchmark_results) - - logger.info("āœ… Comprehensive benchmark completed") - return benchmark_results - - def _calculate_benchmark_summary(self, results: Dict[str, Any]) -> Dict[str, Any]: - """Calculate benchmark summary statistics""" - summary = {} - - for operation_type in ["field_addition", "constraint_verification", "witness_generation"]: - operation_results = results[operation_type] - - speedups = [r["result"]["speedup"] for r in operation_results if r["result"]["speedup"]] - throughputs = [r["result"]["throughput"] for r in operation_results if r["result"]["throughput"]] - - if speedups: - summary[f"{operation_type}_avg_speedup"] = sum(speedups) / len(speedups) - summary[f"{operation_type}_max_speedup"] = max(speedups) - - if throughputs: - summary[f"{operation_type}_avg_throughput"] = sum(throughputs) / len(throughputs) - summary[f"{operation_type}_max_throughput"] = max(throughputs) - - return summary - -# Global API instance -cuda_zk_api = ProductionCUDAZKAPI() - -async def main(): - """Main function for testing the production API""" - print("šŸš€ AITBC Production CUDA ZK API Test") - print("=" * 50) - - try: - # Test field addition - print("\nšŸ“Š Testing Field Addition...") - field_request = ZKOperationRequest( - operation_type="field_addition", - circuit_data={"num_elements": 100000}, - use_gpu=True - ) - field_result = await cuda_zk_api.process_zk_operation(field_request) - print(f" Result: {field_result.success}") - print(f" GPU Used: {field_result.gpu_used}") - print(f" Speedup: {field_result.speedup:.2f}x" if field_result.speedup else " Speedup: N/A") - - # Test constraint verification - print("\nšŸ“Š Testing Constraint Verification...") - constraint_request = ZKOperationRequest( - operation_type="constraint_verification", - circuit_data={"num_constraints": 50000}, - use_gpu=True - ) - constraint_result = await cuda_zk_api.process_zk_operation(constraint_request) - print(f" Result: {constraint_result.success}") - print(f" GPU Used: {constraint_result.gpu_used}") - print(f" Speedup: {constraint_result.speedup:.2f}x" if constraint_result.speedup else " Speedup: N/A") - - # Test witness generation - print("\nšŸ“Š Testing Witness Generation...") - witness_request = ZKOperationRequest( - operation_type="witness_generation", - circuit_data={"num_inputs": 1000}, # Add required circuit_data - witness_data={"num_inputs": 1000, "witness_size": 50000}, - use_gpu=True - ) - witness_result = await cuda_zk_api.process_zk_operation(witness_request) - print(f" Result: {witness_result.success}") - print(f" GPU Used: {witness_result.gpu_used}") - print(f" Speedup: {witness_result.speedup:.2f}x" if witness_result.speedup else " Speedup: N/A") - - # Get performance statistics - print("\nšŸ“Š Performance Statistics:") - stats = cuda_zk_api.get_performance_statistics() - for key, value in stats.items(): - print(f" {key}: {value}") - - # Run comprehensive benchmark - print("\nšŸš€ Running Comprehensive Benchmark...") - benchmark_results = await cuda_zk_api.benchmark_comprehensive_performance(100000) - - print("\nāœ… Production API test completed successfully!") - - except Exception as e: - print(f"āŒ Test failed: {e}") - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/dev/gpu_acceleration/legacy/marketplace_gpu_optimizer.py b/dev/gpu_acceleration/parallel_processing/marketplace_gpu_optimizer.py similarity index 100% rename from dev/gpu_acceleration/legacy/marketplace_gpu_optimizer.py rename to dev/gpu_acceleration/parallel_processing/marketplace_gpu_optimizer.py